TREBALL DE FI DE GRAU¶

ACCIDENTS DE TRÀNSIT: Predicció i factors de risc a través de models estadístics¶

1. INTRODUCCIÓ¶

1.1 OBJECTIU¶

1.2 BASE DE DADES¶

2. PREPROCESSAMENT¶

In [68]:
# Llibreries necessàries
import pandas as pd
import seaborn as sns
import numpy as np
import time
import random
import imblearn
import warnings
import matplotlib.pyplot as plt
import plotly.express as px
import matplotlib.cm as cm
import matplotlib.dates as mdates
from datetime import datetime
from scipy.stats import chi2_contingency
from sklearn.impute import IterativeImputer, KNNImputer
from imblearn.over_sampling import SMOTE
from sklearn.experimental import enable_iterative_imputer, enable_halving_search_cv  
from sklearn.model_selection import train_test_split, cross_val_score, HalvingGridSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.feature_selection import chi2, SelectKBest, f_classif
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import xgboost as xgb
In [2]:
# Importem les dades
dades = pd.read_csv('roadaccident.csv')
dades.head()
Out[2]:
Accident_Index Accident Date Day_of_Week Junction_Control Junction_Detail Accident_Severity Latitude Light_Conditions Local_Authority_(District) Carriageway_Hazards ... Number_of_Casualties Number_of_Vehicles Police_Force Road_Surface_Conditions Road_Type Speed_limit Time Urban_or_Rural_Area Weather_Conditions Vehicle_Type
0 200901BS70001 1/1/2021 Thursday Give way or uncontrolled T or staggered junction Serious 51.512273 Daylight Kensington and Chelsea None ... 1 2 Metropolitan Police Dry One way street 30 15:11 Urban Fine no high winds Car
1 200901BS70002 1/5/2021 Monday Give way or uncontrolled Crossroads Serious 51.514399 Daylight Kensington and Chelsea None ... 11 2 Metropolitan Police Wet or damp Single carriageway 30 10:59 Urban Fine no high winds Taxi/Private hire car
2 200901BS70003 1/4/2021 Sunday Give way or uncontrolled T or staggered junction Slight 51.486668 Daylight Kensington and Chelsea None ... 1 2 Metropolitan Police Dry Single carriageway 30 14:19 Urban Fine no high winds Taxi/Private hire car
3 200901BS70004 1/5/2021 Monday Auto traffic signal T or staggered junction Serious 51.507804 Daylight Kensington and Chelsea None ... 1 2 Metropolitan Police Frost or ice Single carriageway 30 8:10 Urban Other Motorcycle over 500cc
4 200901BS70005 1/6/2021 Tuesday Auto traffic signal Crossroads Serious 51.482076 Darkness - lights lit Kensington and Chelsea None ... 1 2 Metropolitan Police Dry Single carriageway 30 17:25 Urban Fine no high winds Car

5 rows × 21 columns

In [3]:
dim = dades.shape
print(dim)
(307973, 21)
In [4]:
# Recompte NA
warnings.simplefilter(action='ignore', category=FutureWarning)

recompte_NA = dades.isna().sum()
recompte_NA = pd.DataFrame({'VARIABLE': recompte_NA.index, 'NA': recompte_NA.values})

recompte_NA['% NA'] = (recompte_NA['NA'] / len(dades)) * 100   # percentatge (%)

plt.figure(figsize=(10, 6))
sns.barplot(x = 'VARIABLE', y = '% NA', data = recompte_NA, hue = 'VARIABLE', color = '#228B22', legend=False)
plt.xticks(rotation=45, ha='right')
plt.title('PERCENTATGE DE VALORS PERDUTS PER VARIABLE', fontsize = 15)
plt.ylabel('% NA', fontsize = 11)

for index, value in enumerate (recompte_NA['% NA']):
    plt.text(index, value, f'{round(value,3)}%', ha = 'center', va = 'bottom', 
             color = 'black', rotation = 0, fontsize = 8, weight = 'bold')
plt.savefig('NA.png', bbox_inches='tight')
plt.show()
In [5]:
# Valors atípics 
dades_numeriques = dades[['Number_of_Casualties','Number_of_Vehicles','Speed_limit']]
columnes = dades_numeriques.columns
n = len(dades)

for i in columnes:
    Q1 = dades[i].quantile(0.25)
    Q3 = dades[i].quantile(0.75)
    
    Rang_Interquartilic = Q3 - Q1
    
    L_sup = Q3 + 1.5 * Rang_Interquartilic
    L_inf = Q1 - 1.5 * Rang_Interquartilic
    
    Valors_atipics = dades[(dades[i] < L_inf) | (dades[i] > L_sup)]
    Num_outliers = len(Valors_atipics)
    Percentatge_outliers = (Num_outliers*100)/n

    print('\n La quantitat de valors atípics de la variable',i ,f'és {Num_outliers}, que representa un {round(Percentatge_outliers,2)} % del total dels casos')
 La quantitat de valors atípics de la variable Number_of_Casualties és 72880, que representa un 23.66 % del total dels casos

 La quantitat de valors atípics de la variable Number_of_Vehicles és 6803, que representa un 2.21 % del total dels casos

 La quantitat de valors atípics de la variable Speed_limit és 0, que representa un 0.0 % del total dels casos
In [6]:
# Casos duplicats
dades.duplicated().any()
duplicats = dades[dades.duplicated()]
num_duplicats = duplicats.shape[0]

# Eliminem casos duplicats
dades = dades.drop_duplicates()
In [7]:
warnings.filterwarnings("ignore")

# Creem variables noves
dades['Accident Date'] = pd.to_datetime(dades['Accident Date'], format='%m/%d/%Y')
dades['Any_Accident'] = dades['Accident Date'].dt.year
dades['Mes_Accident'] = dades['Accident Date'].dt.month
dades['Dia_Accident'] = dades['Accident Date'].dt.day

# Eliminem variables redundants
dades = dades.drop(columns = ['Accident_Index']) 
dades.head()
Out[7]:
Accident Date Day_of_Week Junction_Control Junction_Detail Accident_Severity Latitude Light_Conditions Local_Authority_(District) Carriageway_Hazards Longitude ... Road_Surface_Conditions Road_Type Speed_limit Time Urban_or_Rural_Area Weather_Conditions Vehicle_Type Any_Accident Mes_Accident Dia_Accident
0 2021-01-01 Thursday Give way or uncontrolled T or staggered junction Serious 51.512273 Daylight Kensington and Chelsea None -0.201349 ... Dry One way street 30 15:11 Urban Fine no high winds Car 2021 1 1
1 2021-01-05 Monday Give way or uncontrolled Crossroads Serious 51.514399 Daylight Kensington and Chelsea None -0.199248 ... Wet or damp Single carriageway 30 10:59 Urban Fine no high winds Taxi/Private hire car 2021 1 5
2 2021-01-04 Sunday Give way or uncontrolled T or staggered junction Slight 51.486668 Daylight Kensington and Chelsea None -0.179599 ... Dry Single carriageway 30 14:19 Urban Fine no high winds Taxi/Private hire car 2021 1 4
3 2021-01-05 Monday Auto traffic signal T or staggered junction Serious 51.507804 Daylight Kensington and Chelsea None -0.203110 ... Frost or ice Single carriageway 30 8:10 Urban Other Motorcycle over 500cc 2021 1 5
4 2021-01-06 Tuesday Auto traffic signal Crossroads Serious 51.482076 Darkness - lights lit Kensington and Chelsea None -0.173445 ... Dry Single carriageway 30 17:25 Urban Fine no high winds Car 2021 1 6

5 rows × 23 columns

In [ ]:
# Comprovem de quina classe és cada variable
dades.dtypes

Analitzem totes les variables de la base de dades:

In [8]:
# Accident Date
summary_Accident_Date = dades['Accident Date'].describe(datetime_is_numeric=True)

# Day_of_Week
dades['Day_of_Week'] = dades['Day_of_Week'].astype('category')
summary_Day_of_Week = dades['Day_of_Week'].describe()
classes_Day_of_Week = dades['Day_of_Week'].cat.categories
casos_per_classe_Day_of_Week = dades['Day_of_Week'].value_counts()
##print(casos_per_classe_Day_of_Week)

# Junction_Control
dades['Junction_Control'] = dades['Junction_Control'].astype('category')
summary_Junction_Control = dades['Junction_Control'].describe()
classes_Junction_Control = dades['Junction_Control'].cat.categories
casos_per_classe_Junction_Control = dades['Junction_Control'].value_counts()
##print(casos_per_classe_Junction_Control)

# Junction_Detail
dades['Junction_Detail'] = dades['Junction_Detail'].astype('category')
summary_Junction_Detail = dades['Junction_Detail'].describe()
classes_Junction_Detail = dades['Junction_Detail'].cat.categories
casos_per_classe_Junction_Detail = dades['Junction_Detail'].value_counts()
##print(casos_per_classe_Junction_Detail)

# Accident_Severity
dades['Accident_Severity'] = dades['Accident_Severity'].astype('category')
summary_Accident_Severity = dades['Accident_Severity'].describe()
classes_Accident_Severity = dades['Accident_Severity'].cat.categories
casos_per_classe_Accident_Severity = dades['Accident_Severity'].value_counts()
##print(casos_per_classe_Accident_Severity)

# Latitude
summary_Latitude = dades['Latitude'].describe()

# Light_Conditions
dades['Light_Conditions'] = dades['Light_Conditions'].astype('category')
summary_Light_Conditions = dades['Light_Conditions'].describe()
classes_Light_Conditions = dades['Light_Conditions'].cat.categories
casos_per_classe_Light_Conditions = dades['Light_Conditions'].value_counts()
##print(casos_per_classe_Light_Conditions)

# Local_Authority_(District)
dades['Local_Authority_(District)'] = dades['Local_Authority_(District)'].astype('category')
summary_Local_Authority_District = dades['Local_Authority_(District)'].describe()
classes_Local_Authority_District = dades['Local_Authority_(District)'].cat.categories
casos_per_classe_Local_Authority_District = dades['Local_Authority_(District)'].value_counts()
##print(casos_per_classe_Local_Authority_District)

# Carriageway_Hazards
dades['Carriageway_Hazards'] = dades['Carriageway_Hazards'].astype('category')
summary_Carriageway_Hazards = dades['Carriageway_Hazards'].describe()
classes_Carriageway_Hazards = dades['Carriageway_Hazards'].cat.categories
casos_per_classe_Carriageway_Hazards = dades['Carriageway_Hazards'].value_counts()
##print(casos_per_classe_Carriageway_Hazards)

# Longitude
summary_Longitude = dades['Longitude'].describe()

# Number_of_Casualties
summary_Number_of_Casualties = dades['Number_of_Casualties'].describe()
taula_freq_Number_of_Casualties = dades['Number_of_Casualties'].value_counts()

# Number_of_Vehicles
summary_Number_of_Vehicles = dades['Number_of_Vehicles'].describe()
taula_freq_Number_of_Vehicles = dades['Number_of_Vehicles'].value_counts()

# Police_Force
dades['Police_Force'] = dades['Police_Force'].astype('category')
summary_Police_Force = dades['Police_Force'].describe()
classes_Police_Force = dades['Police_Force'].cat.categories
casos_per_classe_Police_Force = dades['Police_Force'].value_counts()
##print(casos_per_classe_Police_Force)

# Road_Surface_Conditions
dades['Road_Surface_Conditions'] = dades['Road_Surface_Conditions'].astype('category')
summary_Road_Surface_Conditions = dades['Road_Surface_Conditions'].describe()
classes_Road_Surface_Conditions = dades['Road_Surface_Conditions'].cat.categories
casos_per_classe_Road_Surface_Conditions = dades['Road_Surface_Conditions'].value_counts()
##print(casos_per_classe_Road_Surface_Conditions)

# Road_Type
dades['Road_Type'] = dades['Road_Type'].astype('category')
summary_Road_Type = dades['Road_Type'].describe()
classes_Road_Type = dades['Road_Type'].cat.categories
casos_per_classe_Road_Type = dades['Road_Type'].value_counts()
##print(casos_per_classe_Road_Type)

# Speed_limit
summary_Speed_limit = dades['Speed_limit'].describe()
taula_freq_Speed_limit = dades['Speed_limit'].value_counts()

# Time
dades['Time'] = dades['Time'].astype('category')
summary_Time = dades['Time'].describe()
classes_Time = dades['Time'].cat.categories
casos_per_classe_Time = dades['Time'].value_counts()
##print(casos_per_classe_Time)

# Urban_or_Rural_Area
dades['Urban_or_Rural_Area'] = dades['Urban_or_Rural_Area'].astype('category')
summary_Urban_or_Rural_Area = dades['Urban_or_Rural_Area'].describe()
classes_Urban_or_Rural_Area = dades['Urban_or_Rural_Area'].cat.categories
casos_per_classe_Urban_or_Rural_Area = dades['Urban_or_Rural_Area'].value_counts()
##print(casos_per_classe_Urban_or_Rural_Area)

# Weather_Conditions 
dades['Weather_Conditions'] = dades['Weather_Conditions'].astype('category')
summary_Weather_Conditions = dades['Weather_Conditions'].describe()
classes_Weather_Conditions = dades['Weather_Conditions'].cat.categories
casos_per_classe_Weather_Conditions = dades['Weather_Conditions'].value_counts()
##print(casos_per_classe_Weather_Conditions)

# Vehicle_Type
dades['Vehicle_Type'] = dades['Vehicle_Type'].astype('category')
summary_Vehicle_Type = dades['Vehicle_Type'].describe()
classes_Vehicle_Type = dades['Vehicle_Type'].cat.categories
casos_per_classe_Vehicle_Type = dades['Vehicle_Type'].value_counts()
##print(casos_per_classe_Vehicle_Type)

# Any_Accident
dades['Any_Accident'] = dades['Any_Accident'].astype('category')
summary_Any_Accident = dades['Any_Accident'].describe()
classes_Any_Accident = dades['Any_Accident'].cat.categories
casos_per_classe_Any_Accident = dades['Any_Accident'].value_counts()
##print(casos_per_classe_Any_Accident)

# Month_Accident
summary_Mes_Accident = dades['Mes_Accident'].describe()
taula_freq_Mes_Accident = dades['Mes_Accident'].value_counts()
In [9]:
# Comprovem el canvi de tipus object a categòric
dades.dtypes
Out[9]:
Accident Date                 datetime64[ns]
Day_of_Week                         category
Junction_Control                    category
Junction_Detail                     category
Accident_Severity                   category
Latitude                             float64
Light_Conditions                    category
Local_Authority_(District)          category
Carriageway_Hazards                 category
Longitude                            float64
Number_of_Casualties                   int64
Number_of_Vehicles                     int64
Police_Force                        category
Road_Surface_Conditions             category
Road_Type                           category
Speed_limit                            int64
Time                                category
Urban_or_Rural_Area                 category
Weather_Conditions                  category
Vehicle_Type                        category
Any_Accident                        category
Mes_Accident                           int64
Dia_Accident                           int64
dtype: object

3. ANÀLISI DESCRIPTIVA¶

3.1 ANÀLISI DESCRIPTIVA QUANTITATIVA¶

Gràfics previs a realitzar canvis en certes variables.

In [10]:
# Univariant

# Number_of_Casualties
frequencia_casualties = dades['Number_of_Casualties'].value_counts().sort_index()
plt.figure(figsize=(8, 6))
plt.bar(frequencia_casualties.index, frequencia_casualties.values, color = '#008000')
plt.xlim(0, max(frequencia_casualties.index) + 1)
plt.xlabel('Number of Casualties')
plt.ylabel('Freqüència')
plt.title('FREQÜÈNCIA D\'ACCIDENTS PER NOMBRE DE VÍCTIMES', fontsize = 15)
for i, v in zip(frequencia_casualties.index, frequencia_casualties.values):
    plt.text(i, v + 0.1, str(v), ha='center', va='bottom', color='black', fontsize=6)
#plt.savefig('gqt_casualties_1.png', bbox_inches='tight')
plt.show()


# Number_of_Vehicles
frequencia_vehicles = dades['Number_of_Vehicles'].value_counts().sort_index()
plt.figure(figsize=(8, 6))
plt.bar(frequencia_vehicles.index, frequencia_vehicles.values, color = '#008000', width = 0.8)
plt.xlim(0, max(frequencia_vehicles.index) + 1)
plt.xlabel('Number of Vehicles')
plt.ylabel('Freqüència')
plt.title('FREQÜÈNCIA D\'ACCIDENTS PER NOMBRE DE VEHICLES', fontsize = 15)
for i, v in zip(frequencia_vehicles.index, frequencia_vehicles.values):
    plt.text(i , v + 0.1, str(v), ha='center', va='bottom', color='black', fontsize=6)
#plt.savefig('gqt_NumVehicles_1.png', bbox_inches='tight')
plt.show()


# Speed_Limit
frequencia_velocitat = dades['Speed_limit'].value_counts().sort_index()
plt.figure(figsize=(8, 6))
plt.bar(frequencia_velocitat.index, frequencia_velocitat.values, color = '#008000', width = 4)
plt.xlabel('Speed Limit')
plt.ylabel('Freqüència')
plt.title('FREQÜÈNCIA D\'ACCIDENTS DELS DIFERENTS LÍMITS DE VELOCITAT', fontsize = 15)
for i, v in zip(frequencia_velocitat.index, frequencia_velocitat.values):
    plt.text(i , v + 0.1, str(v), ha='center', va='bottom', color='black', fontsize=6)
#plt.savefig('gqt_SpeedLimit_1.png', bbox_inches='tight')
plt.show()

# Mes_Accident
x = ['Gener','Febrer','Març','Abril','Maig','Juny','Juliol',' Agost', 'Setembre','Octubre', 'Novembre', 'Desembre']
frequencia_mes = dades['Mes_Accident'].value_counts().sort_index()
plt.figure(figsize=(8, 6))
plt.bar(frequencia_mes.index, frequencia_mes.values, color = '#008000', width = 0.9)
plt.xlabel('Mes Accident')
plt.ylabel('Freqüència')
plt.title('FREQÜÈNCIA D\'ACCIDENTS PER MES', fontsize = 15)
plt.xticks(ticks = range(1, len(x) + 1), labels = x, rotation = 90, fontsize = 8)
#plt.savefig('gq_MesAccidentt.png', bbox_inches='tight')
plt.show()

# Dia_Accident
x = list(range(1,32))
frequencia_dia = dades['Dia_Accident'].value_counts().sort_index()
plt.figure(figsize=(8, 6))
plt.bar(frequencia_dia.index, frequencia_dia.values, color = '#008000', width = 0.9)
plt.xlabel('Dia Accident')
plt.ylabel('Freqüència')
plt.title('FREQÜÈNCIA D\'ACCIDENTS PER DIA', fontsize = 15)
plt.xticks(ticks = range(1, len(x) + 1), labels = x, rotation = 90, fontsize = 8)
#plt.savefig('gq_DiaAccidentt.png', bbox_inches='tight')
plt.show()

# UNIVARIANT - QUALITATIVA

# Vehicle_Type
frequencia_vehicletype = dades['Vehicle_Type'].value_counts().sort_index()
plt.figure(figsize=(8, 6))
plt.bar(frequencia_vehicletype.index, frequencia_vehicletype.values, color = '#008000', width = 0.9)
plt.xlabel('Vehicle Type')
plt.ylabel('Freqüència')
plt.title('FREQÜÈNCIA D\'ACCIDENTS DELS DIFERENTS TIPUS DE VEHICLES', fontsize = 15)
plt.xticks(rotation=90, fontsize=8)
for i, v in zip(frequencia_vehicletype.index, frequencia_vehicletype.values):
    plt.text(i , v + 0.1, str(v), ha='center', va='bottom', color='black', fontsize=7)
#plt.savefig('gq_TypeVehicle_1.png', bbox_inches='tight')
plt.show()
In [12]:
# Bivariant

# Gràfic dispersió Speed Limit i Number of Casualties
plt.figure(figsize=(10, 8))
plt.scatter(dades['Speed_limit'], dades['Number_of_Casualties'],color = '#008000')
plt.yticks(range(0, max(dades['Number_of_Casualties']) + 1, 5))

plt.xlabel('Speed Limit')
plt.ylabel('Number of Casualties')
plt.title('RELACIÓ ENTRE EL LÍMIT DE VELOCITAT I EL NOMBRE DE VÍCTIMES EN ELS ACCIDENTS', fontsize = 15)
#plt.savefig('gqtqt_SpeedCasualties_2.png', bbox_inches='tight')
plt.show()

correlation1 = np.corrcoef(dades['Speed_limit'], dades['Number_of_Casualties'])[0, 1]

# Gràfic dispersió Number of vehicles i Number of Casualties
plt.figure(figsize=(10, 8))
plt.scatter(dades['Number_of_Vehicles'], dades['Number_of_Casualties'],color = '#008000')
plt.yticks(range(0, max(dades['Number_of_Casualties']) + 1, 5))

plt.xlabel('Number of Vehicles')
plt.ylabel('Number of Casualties')
plt.title('RELACIÓ ENTRE EL NOMBRE DE VEHICLES I EL NOMBRE DE VÍCTIMES EN ELS ACCIDENTS', fontsize = 15)
#plt.savefig('gqtqt_NumVehiclesCasualties_2.png', bbox_inches='tight')
plt.show()

correlation2 = np.corrcoef(dades['Number_of_Vehicles'], dades['Number_of_Casualties'])[0, 1]

Apliquem les modificacions necessàries:

  • Accident Severity: Passar fetal a fatal (error ortogràfic)
  • Junction_Control: Passar 'Auto traffic sigl' a 'Auto traffic signal' (error ortogràfic)
  • Number_of_Casualities: posar >=4 com a nova categoria en comptes de (4,5,6,7,8,9...)
  • Number_of_Vehicles: posar >=4 com a nova categoria en comptes de (4,5,6,7,8,9...)
In [13]:
# Accident_Severity - error ortogràfic
dades['Accident_Severity'] = dades['Accident_Severity'].replace('Fetal','Fatal') 

# Junction_Control - error ortogràfic
dades['Junction_Control'] = dades['Junction_Control'].replace('Auto traffic sigl',
                                                              'Auto traffic signal')

# Number_of_Casualties
dades['Number_of_Casualties'] = pd.Categorical(dades['Number_of_Casualties'], ordered=True)

if '4 o més' not in dades['Number_of_Casualties'].cat.categories:
    dades['Number_of_Casualties'] = dades['Number_of_Casualties'].cat.add_categories('4 o més')
    
dades.loc[dades['Number_of_Casualties'] >= 4, 'Number_of_Casualties'] = '4 o més'

categoria_a_eliminar = [42,43,48,40,19,21,22,24,26,27,13,14,15,16,17,18,12,11,10,9,8,7,6,4,5]
categories_presents = dades['Number_of_Casualties'].cat.categories
categories_a_eliminar2 = [categoria for categoria in categoria_a_eliminar if categoria in categories_presents]

if categories_a_eliminar2:
    dades['Number_of_Casualties'] = dades['Number_of_Casualties'].cat.remove_categories(categories_a_eliminar2)

# Number_of_Vehicles
dades['Number_of_Vehicles'] = pd.Categorical(dades['Number_of_Vehicles'], ordered=True)

if '4 o més' not in dades['Number_of_Vehicles'].cat.categories:
    dades['Number_of_Vehicles'] = dades['Number_of_Vehicles'].cat.add_categories('4 o més')
    
dades.loc[dades['Number_of_Vehicles'] >=4, 'Number_of_Vehicles'] = "4 o més"

categoria_a_eliminar2 = [4,5,6,7,8,9,10,11,12,13,14,16,19,32]
categories_presents2 = dades['Number_of_Vehicles'].cat.categories
categories_a_eliminar22 = [categoria for categoria in categoria_a_eliminar2 if categoria in categories_presents2]

if categories_a_eliminar22:
    dades['Number_of_Vehicles'] = dades['Number_of_Vehicles'].cat.remove_categories(categories_a_eliminar22)
    
# Speed_Limit
dades['Speed_limit'] = dades['Speed_limit'].astype('category')
categories_a_eliminar3 = [10,15]
dades['Speed_limit'] = dades['Speed_limit'].cat.remove_categories(categories_a_eliminar3)


# Vehicle_Type
dades['Vehicle_Type'] = pd.Categorical(dades['Vehicle_Type'], ordered=True)

dades.loc[dades['Vehicle_Type'] == "Pedal cycle", 'Vehicle_Type'] = "Other vehicle"
dades.loc[dades['Vehicle_Type'] == "Ridden horse", 'Vehicle_Type'] = "Other vehicle"

categoria_a_eliminar3 = ["Pedal cycle", "Ridden horse"]
categories_presents3 = dades['Vehicle_Type'].cat.categories
categories_a_eliminar33 = [categoria for categoria in categoria_a_eliminar3 if categoria in categories_presents3]

if categories_a_eliminar33:
    dades['Vehicle_Type'] = dades["Vehicle_Type"].cat.remove_categories(categories_a_eliminar33)
    
# Light_Conditions
dades['Light_Conditions'] = pd.Categorical(dades['Light_Conditions'], ordered=True)

dades.loc[dades['Light_Conditions'] == "Darkness - lights unlit", 'Light_Conditions'] = "Darkness - no lighting"

categoria_a_eliminar4 = ["Darkness - lights unlit"]
categories_presents4 = dades['Light_Conditions'].cat.categories
categories_a_eliminar44 = [categoria for categoria in categoria_a_eliminar4 if categoria in categories_presents4]

if categories_a_eliminar44:
    dades['Light_Conditions'] = pd.Categorical(dades['Light_Conditions'].cat.remove_categories(categories_a_eliminar44))

Gràfics després d'aplicar modificacions:

In [15]:
# Univariant

# Number_of_Casualties
categories = ['1', '2', '3', '4 o més']
frequencia_casualties = dades['Number_of_Casualties'].value_counts().sort_index()
plt.figure(figsize=(8, 6))
plt.bar(categories, frequencia_casualties, color = '#008000')
plt.xlabel('Number of Casualties', fontsize = 16)
plt.ylabel('Freqüència', fontsize = 16)
plt.xticks(fontsize = 16)
plt.yticks(fontsize = 16)
plt.title('FREQÜÈNCIA D\'ACCIDENTS PER NOMBRE DE VÍCTIMES', fontsize = 17)
for i, v in zip(categories, frequencia_casualties):
    plt.text(i, v + 0.1, str(v), ha='center', va='bottom', color='black', fontsize = 14)
#plt.savefig('gqt_casualties_2.png', bbox_inches='tight')
plt.show()

# Number_of_Vehicles
categories = ['1', '2', '3', '4 o més']
frequencia_vehicles = dades['Number_of_Vehicles'].value_counts().sort_index()
plt.figure(figsize=(8, 6))
plt.bar(categories, frequencia_vehicles, color = '#008000', width = 0.8)
plt.xlabel('Number of Vehicles', fontsize = 16)
plt.ylabel('Freqüència', fontsize = 16)
plt.xticks(fontsize = 16)
plt.yticks(fontsize = 16)
plt.title('FREQÜÈNCIA D\'ACCIDENTS PER NOMBRE DE VEHÍCLES', fontsize = 17)
for i, v in zip(categories, frequencia_vehicles):
    plt.text(i , v + 0.1, str(v), ha='center', va='bottom', color='black', fontsize = 14)
#plt.savefig('gqt_NumVehicles_2.png', bbox_inches='tight')
plt.show()

# Speed_Limit
dades = dades.dropna(subset = ['Speed_limit'])
frequencia_velocitat = dades['Speed_limit'].value_counts().sort_index()
plt.figure(figsize=(8, 6))
plt.bar(frequencia_velocitat.index, frequencia_velocitat.values, color = '#008000', width = 4)
plt.xlabel('Speed Limit', fontsize = 15)
plt.ylabel('Freqüència', fontsize = 15)
plt.xticks(fontsize = 14)
plt.yticks(fontsize = 14)
plt.title('FREQÜÈNCIA D\'ACCIDENTS AMB ELS DIFERENTS LÍMITS DE VELOCITAT', fontsize = 15)
for i, v in zip(frequencia_velocitat.index, frequencia_velocitat.values):
    plt.text(i , v + 0.1, str(v), ha='center', va='bottom', color='black', fontsize = 14)
#plt.savefig('gqt_SpeedLimit_2.png', bbox_inches='tight')
plt.show()

# Any_Accident
frequencia_any = dades['Any_Accident'].value_counts().sort_index()
plt.figure(figsize=(8, 8))
plt.pie(frequencia_any, labels=frequencia_any.index, autopct='%1.1f%%', 
        textprops={'fontsize': 14, 'color': 'white'}, startangle=90, 
        colors=['#008000', '#DAA520'])
plt.legend(labels=frequencia_any.index, title='Year', loc='upper right', fontsize = 15, title_fontsize = 17)
plt.title('FREQÜÈNCIA D\'ACCIDENTS PER ANY', fontsize = 15)
#plt.savefig('gqt_YearAccident_2.png', bbox_inches='tight')
plt.show()
In [17]:
# Bivariant (1)

## Speed Limit + Number of Casualties 

### Barres apilades
cross_table = pd.crosstab(dades['Speed_limit'],dades['Number_of_Casualties'])
cross_table = cross_table.div(cross_table.sum(axis=1), axis=0)
cross_table.plot(kind='bar', color = ['#008000', '#DAA520', '#FF0000', '#5F9EA0'], stacked=True, figsize=(9, 7))

plt.legend(title='Number of Casualties', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.xlabel("Speed Limit", fontsize = 11)
plt.yticks(fontsize = 11)
plt.xticks(fontsize = 11)
plt.title('PROPORCIÓ DEL NOMBRE DE VÍCTIMES EN ELS DIFERENTS\nLÍMITS DE VELOCITAT', fontsize = 15)
#plt.savefig('gqtqt_SpeedCasualties_1.png', bbox_inches='tight')
plt.show()

### Diagrama de violí
dades_new = dades.copy()

speed_limit_order = sorted(dades_new['Speed_limit'].unique())
dades_new['Speed_limit_numeric'] = pd.to_numeric(dades_new['Speed_limit'], errors='coerce')

plt.figure(figsize=(10, 8))
sns.violinplot(x=dades_new['Number_of_Casualties'], y=dades_new['Speed_limit_numeric'], 
               scale="width", palette= ['#008000', '#DAA520', '#FF0000', '#5F9EA0'])
plt.yticks(speed_limit_order, fontsize = 15)
plt.xticks(ticks=[0, 1, 2, 3], labels=['1', '2', '3', '4 o més'], fontsize=15)
plt.xlabel('Number of Casualties', fontsize =  15)
plt.ylabel('Speed Limit', fontsize = 15)
plt.title('GRÀFIC DE VIOLÍ DEL LÍMIT DE VELOCITAT SEGONS\nEL NOMBRE DE VÍCTIMES', fontsize = 17)
#plt.savefig('gqtqt_SpeedCasualties3.png', bbox_inches='tight')
plt.show()

### BoxPlot
plt.figure(figsize=(10, 8))
sns.boxplot(x=dades_new['Number_of_Casualties'], y=dades_new['Speed_limit_numeric'], 
            palette= ['#008000', '#DAA520', '#FF0000', '#5F9EA0'])
plt.yticks(speed_limit_order)
plt.xlabel('Number of Casualties', fontsize = 16)
plt.ylabel('Speed Limit', fontsize = 16)
plt.title('GRÀFIC DE CAIXES DEL LÍMIT DE VELOCITAT SEGONS\nEL NOMBRE DE VÍCTIMES', fontsize = 17)
plt.xticks(ticks=[0, 1, 2, 3], labels=['1', '2', '3', '4 o més'], fontsize=14)
plt.yticks(fontsize=14)
#plt.savefig('gqtqt_SpeedCasualties_boxplot.png', bbox_inches='tight')
plt.show()
In [18]:
# Bivariant (2)

## Road Surface Conditions i Speed limit
dades['Speed_limit'] = pd.to_numeric(dades['Speed_limit'], errors='coerce')

plt.figure(figsize=(10, 8))
sns.boxplot(x='Road_Surface_Conditions', y='Speed_limit', data=dades, color = "#008000")
plt.xlabel('Road Surface Conditions')
plt.ylabel('Speed Limit')
plt.xticks(fontsize = 11)
plt.yticks(fontsize = 11)
plt.title('ANÀLISI DELS LÍMITS DE VELOCITAT EN DIFERENTS CONDICIONS\nDE LA SUPERFÍCIE DE LA CARRETERA', fontsize = 15)
#plt.savefig('gqtql_SurfaceSpeed.png', bbox_inches='tight')
plt.show()

## Accident_Severity i Speed limit

### Boxplot
dades['Speed_limit'] = pd.to_numeric(dades['Speed_limit'], errors='coerce')

plt.figure(figsize=(10, 8))
sns.boxplot(x='Accident_Severity', y='Speed_limit', data=dades, palette=["red","orange","green"])
plt.xlabel('Accident_Severity')
plt.ylabel('Speed Limit')
plt.xticks(fontsize = 11)
plt.yticks(fontsize = 11)
plt.title('ANÀLISI DELS LÍMITS DE VELOCITAT EN DIFERENTS\nGRAUS DE GRAVETAT DELS ACCIDENTS', fontsize = 15)
#plt.savefig('gqtql_RespSpeed_1.png', bbox_inches='tight')
plt.show()

### Diagrama de violí
dades['Speed_limit'] = pd.to_numeric(dades['Speed_limit'], errors='coerce')

plt.figure(figsize=(10, 8))
sns.violinplot(x='Accident_Severity', y='Speed_limit', data=dades, palette=["red","orange","green"])
plt.xlabel('Accident Severity', fontsize = 16)
plt.ylabel('Speed Limit', fontsize = 16)
plt.title('DISTRIBUCIÓ DELS LÍMITS DE VELOCITAT SEGONS\nLA GRAVETAT DELS ACCIDENTS', fontsize = 17)
plt.xticks(fontsize = 16)
plt.yticks(fontsize = 16)
#plt.savefig('gqtql_RespSpeed_2.png', bbox_inches='tight')
plt.show()


## Accident_Severity i Number_of_Casualties
dades['Number_of_Casualties'] = pd.Categorical(dades['Number_of_Casualties'], ordered=True)

if '4' not in dades['Number_of_Casualties'].cat.categories:
    dades['Number_of_Casualties'] = dades['Number_of_Casualties'].cat.add_categories('4')
    
dades.loc[dades['Number_of_Casualties'] == "4 o més", 'Number_of_Casualties'] = '4'

categoria_a_eliminar = ["4 o més"]
categories_presents = dades['Number_of_Casualties'].cat.categories
categories_a_eliminar2 = [categoria for categoria in categoria_a_eliminar if categoria in categories_presents]

if categories_a_eliminar2:
    dades['Number_of_Casualties'] = dades['Number_of_Casualties'].cat.remove_categories(categories_a_eliminar2)

dades['Number_of_Casualties'] = pd.to_numeric(dades['Number_of_Casualties'], errors='coerce')

plt.figure(figsize=(10, 8))
sns.violinplot(x='Accident_Severity', y='Number_of_Casualties', data=dades, palette=["red","orange","green"])
plt.xlabel('Accident Severity')
plt.ylabel('Number_of_Casualties')
plt.title('DISTRIBUCIÓ DEL NOMBRE DE VÍCITTES SEGONS\nLA GRAVETAT DELS ACCIDENTS', fontsize = 15)
#plt.savefig('gqtql_RespCasualties_1.png', bbox_inches='tight')
plt.show()

## Accident_Severity i Number_of_Vehicles

dades['Number_of_Vehicles'] = pd.Categorical(dades['Number_of_Vehicles'], ordered=True)

if '4' not in dades['Number_of_Vehicles'].cat.categories:
    dades['Number_of_Vehicles'] = dades['Number_of_Vehicles'].cat.add_categories('4')
    
dades.loc[dades['Number_of_Vehicles'] == "4 o més", 'Number_of_Vehicles'] = "4"

categoria_a_eliminar2 = ["4 o més"]
categories_presents2 = dades['Number_of_Vehicles'].cat.categories
categories_a_eliminar22 = [categoria for categoria in categoria_a_eliminar2 if categoria in categories_presents2]

if categories_a_eliminar22:
    dades['Number_of_Vehicles'] = dades['Number_of_Vehicles'].cat.remove_categories(categories_a_eliminar22)

dades['Number_of_Vehicles'] = pd.to_numeric(dades['Number_of_Vehicles'], errors='coerce')

plt.figure(figsize=(10, 8))
sns.violinplot(x='Accident_Severity', y='Number_of_Vehicles', data=dades, palette=["red","orange","green"])
plt.xlabel('Accident Severity')
plt.ylabel('Number_of_Vehicles')
plt.xticks(fontsize = 11)
plt.yticks(fontsize = 11)
plt.title('DISTRIBUCIÓ DEL NOMBRE DE VEHICLES SEGONS\nLA GRAVETAT DELS ACCIDENTS', fontsize = 15)
#plt.savefig('gqtql_RespNumVehicles_1.png', bbox_inches='tight')
plt.show()

### Boxplot
dades['Number_of_Vehicles'] = pd.to_numeric(dades['Number_of_Vehicles'], errors='coerce')

plt.figure(figsize=(10, 8))
sns.boxplot(x='Accident_Severity', y='Number_of_Vehicles', data=dades, palette=["red","orange","green"])
plt.xlabel('Accident Severity')
plt.ylabel('Number of Vehicles')
plt.xticks(fontsize = 11)
plt.yticks(fontsize = 11)
plt.title('DISTRIBUCIÓ DEL NOMBRE DE VEHICLES SEGONS\nLA GRAVETAT DELS ACCIDENTS', fontsize = 15)
#plt.savefig('gqtql_RespNumVehicles_1.png', bbox_inches='tight')
plt.show()

## Accident_Severity i Mes_Accident 

dades['Mes_Accident'] = pd.to_numeric(dades['Mes_Accident'], errors='coerce')

plt.figure(figsize=(10, 8))
sns.boxplot(x='Accident_Severity', y='Mes_Accident', data=dades, palette=["red","orange","green"])
plt.xlabel('Accident_Severity')
plt.ylabel('Month')
plt.xticks(fontsize = 11)
plt.yticks(fontsize = 11)
plt.title('DISTRIBUCIÓ DEL MES DELS ACCIDENTS SEGONS\nLA DIFERENT GRAVETAT DELS ACCIDENTS', fontsize = 15)
#plt.savefig('gqtql_RespMonth_1.png', bbox_inches='tight')
plt.show()
In [24]:
# Violin plots discretitzats per Any_Accident

dades['Speed_limit'] = pd.to_numeric(dades['Speed_limit'], errors='coerce')

plt.figure(figsize=(10, 8))
sns.violinplot(x='Accident_Severity', y='Speed_limit', data=dades, hue = 'Any_Accident', 
               palette=["orange","red"], split=True)
plt.xlabel('Accident Severity', fontsize = 23)
plt.ylabel('Speed Limit', fontsize = 23)
plt.xticks(fontsize = 23)
plt.yticks(fontsize = 23)
plt.title('LÍMITS DE VELOCITAT SEGONS LA GRAVETAT DELS\nACCIDENTS PER ANY', fontsize = 25, pad = 80)
plt.legend(title = 'Any',loc='upper center', bbox_to_anchor=(0.5, 1.18), fontsize = 21, title_fontsize = 21, ncol = 2)
#plt.savefig('violin3_RespSpeedYear.png', bbox_inches='tight')
plt.show()

dades['Number_of_Casualties'] = pd.to_numeric(dades['Number_of_Casualties'], errors='coerce')

plt.figure(figsize=(10, 8))
sns.violinplot(x='Accident_Severity', y='Number_of_Casualties', data=dades, hue = 'Any_Accident', 
               palette=["orange","red"], split = True)
plt.xlabel('Accident Severity', fontsize = 23)
plt.ylabel('Number_of_Casualties', fontsize = 23)
plt.xticks(fontsize = 23)
plt.yticks(fontsize = 23)
plt.title('DISTRIBUCIÓ DEL NOMBRE DE VÍCTIMES SEGONS\nLA GRAVETAT DELS ACCIDENTS', fontsize = 25, pad = 80)
plt.legend(title='Accident Year',loc='upper right', bbox_to_anchor=(0.75, 1.18), fontsize = 21, title_fontsize = 21, ncol = 2)
#plt.savefig('violin3_RespCasualtiesYear.png', bbox_inches='tight')
plt.show()

dades['Number_of_Vehicles'] = pd.to_numeric(dades['Number_of_Vehicles'], errors='coerce')

plt.figure(figsize=(10, 8))
sns.violinplot(x='Accident_Severity', y='Number_of_Vehicles', data=dades, hue = "Any_Accident", 
               palette=["orange","red"], split=True)
plt.xlabel('Accident Severity', fontsize = 23)
plt.ylabel('Number_of_Vehicles', fontsize = 23)
plt.xticks(fontsize = 23)
plt.yticks(fontsize = 23)
plt.title('DISTRIBUCIÓ DEL NOMBRE DE VEHICLES SEGONS\nLA GRAVETAT DELS ACCIDENTS', fontsize = 25, pad = 80)
plt.legend(title='Accident Year',loc='upper right', bbox_to_anchor=(0.75, 1.18), fontsize = 21, title_fontsize = 21, ncol = 2)
#plt.savefig('violin3_RespNumVehiclesYear.png', bbox_inches='tight')
plt.show()
In [27]:
# Violin plots discretitzats - hue = Number_of_casualties

dades_new['NumberCasualties2'] = dades['Number_of_Casualties'].apply(lambda x: "1" if x == 1 else "Més de 1")

plt.figure(figsize=(10, 8))
sns.violinplot(x='Accident_Severity', y='Speed_limit', data=dades_new, hue='NumberCasualties2', 
               palette = ["orange","red"], split=True)
plt.title('LÍMITS DE VELOCITAT SEGONS LA GRAVETAT DELS\nACCIDENTS PER NOMBRE DE VÍCTIMES', fontsize = 25, pad = 85)
plt.xlabel('Accident Severity', fontsize = 23)
plt.ylabel('Speed limit', fontsize = 23)
plt.xticks(fontsize = 23)
plt.yticks(fontsize = 23)
plt.gca().invert_yaxis()
plt.legend(title='Number of Casualties',loc='upper center', bbox_to_anchor=(0.5, 1.18), fontsize = 21, title_fontsize = 21, ncol = 2)
#plt.savefig('violin3_RespSpeedCasualt_1.png', bbox_inches='tight')
plt.show()


plt.figure(figsize=(10, 8))
dades_new['Number_of_Vehicles'] = dades_new['Number_of_Vehicles'].replace('4 o més', 4)
sns.violinplot(x='Accident_Severity', y='Number_of_Vehicles', data=dades_new, hue = "NumberCasualties2", 
               palette=["orange","red"], split=True)
plt.xlabel('Accident Severity', fontsize = 23)
plt.ylabel('Number of Vehicles', fontsize = 23)
plt.xticks(fontsize = 23)
plt.yticks(fontsize = 23)
plt.gca().invert_yaxis()
plt.title('NOMBRE DE VEHICLES SEGONS LA GRAVETAT DELS ACCIDENTS\nPER NOMBRE DE VÍCTIMES', fontsize = 25, pad = 85)
plt.legend(title='Number of Casualties',loc='upper right', bbox_to_anchor=(0.75, 1.18), fontsize = 21, title_fontsize = 21, ncol = 2)
#plt.savefig('gqtql_RespNumVehicles_1.png', bbox_inches='tight')
plt.show()
In [28]:
# Series temporals

mesos = {
    1: 31, 2: 28, 3: 31, 4: 30, 5: 31, 6: 30, 7: 31, 8: 31, 9: 30, 10: 31, 11: 30, 12: 31
}

mitjanes_per_mes_total = {}

classes = {
    'Fatal': 'Fatal',
    'Serious': 'Serious',
    'Slight': 'Slight'
}

for i, cond in classes.items():
    df = dades[dades['Accident_Severity'] == cond]
    
    mitjanes_per_mes = []
    
    for any_accident in [2021, 2022]:
        df_any = df[df['Any_Accident'] == any_accident]
        
        for mes in range(1, 13):
            df_mes = df_any[df_any['Mes_Accident'] == mes]
            
            num_dies = mesos[mes]
            mitjana_casos = len(df_mes) / num_dies
            
            mitjanes_per_mes.append(mitjana_casos)
            
    mitjanes_per_mes_total[i] = mitjanes_per_mes
    
mitjanes_per_mes_fatal = mitjanes_per_mes_total['Fatal']
mitjanes_per_mes_serious = mitjanes_per_mes_total['Serious']
mitjanes_per_mes_slight = mitjanes_per_mes_total['Slight']
In [30]:
# FATAL
fatal_data = dades[dades['Accident_Severity'] == 'Fatal']
monthly_means = pd.DataFrame(mitjanes_per_mes_fatal, columns=['Mean Fatal Accidents'])

monthly_index = pd.date_range(start='2021-01-01', periods=len(monthly_means), freq='M')
monthly_means.index = monthly_index - pd.offsets.MonthEnd()

daily_counts = fatal_data.groupby('Accident Date').size()

plt.figure(figsize=(12, 6))
daily_counts.plot(kind='line', marker='', color='gray', label='Fatal Accidents per Day')
plt.plot(monthly_means.index, monthly_means['Mean Fatal Accidents'], marker='o', color='red', 
         linestyle='-', label='Monthly Mean Fatal Accidents')
first_day_of_each_month = [pd.Timestamp(year, month, 1) for year in range(2021, 2023) for month in range(1, 13)]

plt.xticks(first_day_of_each_month, rotation=90, fontsize = 14)
plt.yticks(fontsize = 13)
plt.xlabel('')
plt.ylabel('Accidents Fatals', fontsize = 15)
plt.title('EVOLUCIÓ DIÀRIA D\'ACCIDENTS FATALS AMB MITJANA MENSUAL', fontsize=17)
plt.legend(loc='upper right', fontsize = 14, title_fontsize = 14)
plt.ylim(0,20)
plt.savefig('gqtql_TendenciaFatal_1.png', bbox_inches='tight')

plt.show()

# SERIOUS
serious_data = dades[dades['Accident_Severity'] == 'Serious']
monthly_means = pd.DataFrame(mitjanes_per_mes_serious, columns=['Mean Serious Accidents'])

monthly_index = pd.date_range(start='2021-01-01', periods=len(monthly_means), freq='M')
monthly_means.index = monthly_index - pd.offsets.MonthEnd()

daily_counts = serious_data.groupby('Accident Date').size()

plt.figure(figsize=(12, 6))
daily_counts.plot(kind='line', marker='', color='gray', label='Serious Accidents per Day')
plt.plot(monthly_means.index, monthly_means['Mean Serious Accidents'], marker='o', color='darkorange', 
         linestyle='-', label='Monthly Mean Serious Accidents')

first_day_of_each_month = [pd.Timestamp(year, month, 1) for year in range(2021, 2023) for month in range(1, 13)]

plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
plt.xticks(first_day_of_each_month, rotation=90, fontsize = 14)
plt.yticks(fontsize = 14)

plt.xlabel('')
plt.ylabel('Accidents Greus', fontsize = 15)
plt.title('EVOLUCIÓ DIÀRIA D\'ACCIDENTS GREUS AMB MITJANA MENSUAL', fontsize=17)
plt.legend(loc='upper right', fontsize = 14, title_fontsize = 14)


extended_start_date = monthly_index[0] - pd.DateOffset(months=2)
extended_end_date = monthly_index[-1] + pd.DateOffset(months=1)
plt.xlim(extended_start_date, extended_end_date) 
plt.ylim(0,100)
plt.savefig('gqtql_TendenciaSerious_1.png', bbox_inches='tight')

plt.show()

# SLIGHT
slight_data = dades[dades['Accident_Severity'] == 'Slight']
monthly_means = pd.DataFrame(mitjanes_per_mes_slight, columns=['Mean Slight Accidents'])

monthly_index = pd.date_range(start='2021-01-01', periods=len(monthly_means), freq='M')
monthly_means.index = monthly_index - pd.offsets.MonthEnd()

daily_counts = slight_data.groupby('Accident Date').size()

plt.figure(figsize=(12, 6))
daily_counts.plot(kind='line', marker='', color='gray', label='Slight Accidents per Day')
plt.plot(monthly_means.index, monthly_means['Mean Slight Accidents'], marker='o', color='green', 
         linestyle='-', label='Monthly Mean Slight Accidents')
first_day_of_each_month = [pd.Timestamp(year, month, 1) for year in range(2021, 2023) for month in range(1, 13)]

plt.xticks(first_day_of_each_month, [date.strftime('%Y-%m-%d') for date in first_day_of_each_month], rotation=90, fontsize = 14)
plt.yticks(fontsize = 14)

plt.xlabel('')
plt.ylabel('Accidents Lleus', fontsize = 15)
plt.title('EVOLUCIÓ DIÀRIA D\'ACCIDENTS LLEUS AMB MITJANA MENSUAL', fontsize=17)
plt.legend(loc='upper right', fontsize = 14, title_fontsize = 14)

extended_start_date = monthly_index[0] - pd.DateOffset(months=2)
extended_end_date = monthly_index[-1] + pd.DateOffset(months=1)
plt.xlim(extended_start_date, extended_end_date) 
plt.ylim(0,650)
plt.savefig('gqtql_TendenciaSlight_1.png', bbox_inches='tight')

plt.show()
In [31]:
# (Fatal + Serious + Slight)
daily_counts = dades.groupby([dades['Accident Date'].dt.year, dades['Accident Date'].dt.month, dades['Accident Date'].dt.day, 'Accident_Severity']).size().unstack()
plt.figure(figsize=(12, 6))
daily_counts.plot(kind='line', marker='', color=['red', 'orange', 'green'])

plt.xlabel('Data')
plt.ylabel('Accidents')
plt.title('EVOLUCIÓ DE LA GRAVETAT DELS ACCIDENTS AL LARG DEL TEMPS')
plt.legend(title='Accident Severity', loc='upper right', bbox_to_anchor=(1.4, 1))
plt.xticks(rotation=90, fontsize = 9)
plt.yticks(fontsize = 9)

plt.show()
<Figure size 1200x600 with 0 Axes>

3.2 ANÀLISI DESCRIPTIVA QUALITATIVA¶

In [33]:
# Univariant

# Accident_Severity 
frequencia_accident_sev = dades['Accident_Severity'].value_counts().sort_index()
colors = ['red','orange','green']
start_angles = 40
plt.pie(frequencia_accident_sev.values, labels=frequencia_accident_sev.index, autopct='%1.1f%%',
        startangle=start_angles, textprops={'fontsize': 12, 'color': 'white'}, labeldistance=None, colors=colors)
plt.title('FREQÜÈNCIA DE LA GRAVETAT DELS ACCIDENTS', fontsize = 15)
plt.legend(loc='upper right', bbox_to_anchor=(1.2, 1), prop={'size': 12})
#plt.savefig('gq_resp_1')
plt.show()

# Weather_Conditions
frequencia_weather = dades['Weather_Conditions'].value_counts().sort_index()
plt.figure(figsize=(8, 6))
plt.bar(frequencia_weather.index, frequencia_weather.values, color = '#008000', width = 0.9)
plt.xlabel('Weather Conditions', fontsize = 11)
plt.ylabel('Freqüència', fontsize = 11)
plt.title('FREQÜÈNCIA D\'ACCIDENTS PER LES DIFERENTS\nCONDICIONS CLIMÀTIQUES', fontsize = 15)
plt.xticks(rotation=60, fontsize=11)
plt.yticks(fontsize = 11)
for i, v in zip(frequencia_weather.index, frequencia_weather.values):
    plt.text(i , v + 0.1, str(v), ha='center', va='bottom', color='black', fontsize=8)
#plt.savefig('gq_weather_1.png', bbox_inches='tight')
plt.show()

# Light_Conditions
frequencia_light = dades['Light_Conditions'].value_counts().sort_index()
plt.figure(figsize=(8, 6))
plt.bar(frequencia_light.index, frequencia_light.values, color = '#008000', width = 0.9)
plt.xlabel('Light Conditions', fontsize = 11)
plt.ylabel('Freqüència', fontsize = 11)
plt.title('FREQÜÈNCIA D\'ACCIDENTS PER LES DIFERENTS\nCONDICIONS DE LLUM', fontsize = 15)
plt.xticks(rotation=0, fontsize = 8)
plt.yticks(fontsize = 11)
for i, v in zip(frequencia_light.index, frequencia_light.values):
    plt.text(i , v + 0.1, str(v), ha='center', va='bottom', color='black', fontsize=8)
#plt.savefig('gq_light_1')
plt.show()

# Local_Authority_(District)
top_20_districts = dades['Local_Authority_(District)'].value_counts().nlargest(20)

percentatges = (top_20_districts / top_20_districts.sum()) * 100

plt.figure(figsize=(8, 6))
bars = plt.bar(top_20_districts.index, percentatges, color='#008000', width=0.9)
plt.xlabel('20 principals autoritats locals', fontsize = 11)
plt.ylabel('Percentatge (%)')
plt.title('DISTRIBUCIÓ PERCENTUAL DE LES 20 PRINCIPALS\nENTITATS LOCALS')
plt.xticks(rotation=60, fontsize=8)
for bar, percentage in zip(bars, percentatges):
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, yval + 0.1, 
             f'{percentage:.1f}%', ha='center', va='bottom', color='black', fontsize=7)
#plt.savefig('gq_district_1')
plt.show()

# Vehicle_Type
frequencia_vehicletype = dades['Vehicle_Type'].value_counts().sort_index()
plt.figure(figsize=(8, 6))
plt.bar(frequencia_vehicletype.index, frequencia_vehicletype.values, color = '#008000', width = 0.9)
plt.xlabel('Vehicle Type')
plt.ylabel('Freqüència')
plt.title('FREQÜÈNCIA D\'ACCIDENTS PELS DIFERENTS TIPUS\nDE VEHICLES', fontsize = 15)
plt.xticks(rotation=90, fontsize=8)
for i, v in zip(frequencia_vehicletype.index, frequencia_vehicletype.values):
    plt.text(i , v + 0.1, str(v), ha='center', va='bottom', color='black', fontsize=7)
#plt.savefig('gq_vehicle_1')
plt.show()
In [34]:
# Bivariant

# Accident_Severity i Day_of_Week
cross_table = pd.crosstab(dades['Day_of_Week'], dades['Accident_Severity'])
order_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
cross_table = cross_table.reindex(order_days)
cross_table = cross_table.div(cross_table.sum(axis=1), axis=0)

fig, ax = plt.subplots(figsize=(10, 6))
cross_table.plot(kind='bar', stacked=True, color=['red', 'orange', 'green'], ax=ax)

plt.legend(title='Accident Severity', bbox_to_anchor=(1.15, 0.6), loc='upper left', fontsize = 15, title_fontsize = 15)
plt.xlabel('Day of Week', fontsize=15)
plt.xticks(rotation = 0, fontsize=15)
plt.yticks(fontsize = 15)
plt.title('PROPORCIÓ DE LA GRAVETAT DELS ACCIDENTS\nI FREQÜÈNCIA PER DIA', fontsize = 17)

ax2 = ax.twinx()
frequencies = dades['Day_of_Week'].value_counts().reindex(order_days)
ax2.plot(frequencies.index, frequencies.values, color='black', linestyle='-', marker='o', label='Freqüència d\'accidents')
ax2.tick_params(axis='y', labelsize=15)
ax2.legend(loc='upper right', bbox_to_anchor=(1.55, 0.7), fontsize = 15)

#plt.savefig('gqq_RespDay_2.png', bbox_inches='tight')
plt.show()

# Accident_Severity i Road_Surface_Conditions
cross_table = pd.crosstab(dades['Road_Surface_Conditions'], dades['Accident_Severity'])
cross_table = cross_table.div(cross_table.sum(axis=1), axis=0)

frequencies = dades['Road_Surface_Conditions'].value_counts()
frequencies = frequencies.reindex(cross_table.index) 

fig, ax = plt.subplots(figsize=(10, 6))
cross_table.plot(kind='bar', stacked=True, color=['red', 'orange', 'green'], ax=ax)

plt.legend(title='Accident Severity', bbox_to_anchor=(1.2, 0.6), loc='upper left', fontsize = 15, title_fontsize = 15)
plt.xlabel('Road Surface Conditions', fontsize=14)
plt.xticks(ticks=[0, 1, 2, 3, 4], labels=['Dry', 'Flood over 3cm', 'Frost or ice', 'Snow', 'Wet or damp'], 
           fontsize=14, rotation = 0)
plt.yticks(fontsize = 14)
plt.title('PROPORCIÓ DE LA GRAVETAT DELS ACCIDENTS I\nFREQÜÈNCIA PER CONDICIONS DE SUPERFÍCIE', fontsize=17)

ax2 = ax.twinx()
ax2.plot(frequencies.index, frequencies.values, color='black', linestyle='-', marker='o', label='Freqüència d\'accidents')
ax2.tick_params(axis='y', labelsize=14)
ax2.legend(loc='upper right', bbox_to_anchor=(1.5, 0.7), fontsize = 15)

#plt.savefig('gqq_RespSurface_2.png', bbox_inches='tight')
plt.show()

# Accident_Severity i Weather_Conditions
cross_table = pd.crosstab(dades['Weather_Conditions'], dades['Accident_Severity'])
cross_table = cross_table.div(cross_table.sum(axis=1), axis=0)

frequencies = dades['Weather_Conditions'].value_counts()
frequencies = frequencies.reindex(cross_table.index)

fig, ax = plt.subplots(figsize=(10, 6))
cross_table.plot(kind='bar', stacked=True, color=['red', 'orange', 'green'], ax=ax)

plt.legend(title='Accident Severity', bbox_to_anchor=(1.15, 0.6), loc='upper left', fontsize = 14, title_fontsize = 14)
plt.xlabel('Weather Conditions', fontsize=14)
plt.xticks(fontsize=12)
plt.title('PROPORCIÓ DE LA GRAVETAT DELS ACCIDENTS I\nFREQÜÈNCIA PER CONDICIONS CLIMÀTIQUES', fontsize = 15)

ax2 = ax.twinx()
ax2.plot(frequencies.index, frequencies.values, color='black', linestyle='-', marker='o', label='Freqüència d\'accidents')
ax2.legend(loc='upper right', bbox_to_anchor=(1.5, 0.7), fontsize = 14)

#plt.savefig('gqq_RespWeather_2.png', bbox_inches='tight')
plt.show()

# Accident_Severity i Urban or Rural Area
cross_table = pd.crosstab(dades['Urban_or_Rural_Area'], dades['Accident_Severity'])
cross_table = cross_table.div(cross_table.sum(axis=1), axis=0)

frequencies = dades['Urban_or_Rural_Area'].value_counts()
frequencies = frequencies.reindex(cross_table.index)

fig, ax = plt.subplots(figsize = (10, 6))
cross_table.plot(kind = 'bar', stacked = True, color = ['red', 'orange', 'green'], ax = ax)

plt.legend(title='Accident Severity', bbox_to_anchor=(1.15, 0.6), loc='upper left', fontsize = 15, title_fontsize = 15)
plt.xlabel('Urban_or_Rural_Area', fontsize = 15)
plt.xticks(rotation = 0, fontsize = 15)
plt.yticks(fontsize = 15)
plt.title('PROPORCIÓ DE LA GRAVETAT DELS ACCIDENTS I\nFREQÜÈNCIA PER ÀREES', fontsize = 17)

ax2 = ax.twinx()
ax2.plot(frequencies.index, frequencies.values, color='black', linestyle='-', marker='o', label='Freqüència d\'accidents')
ax2.tick_params(axis='y', labelsize=15)
ax2.legend(loc='upper right', bbox_to_anchor=(1.5, 0.7), fontsize = 15)

#plt.savefig('gqq_RespUrbRur_2.png', bbox_inches='tight')
plt.show()

# Accident_Severity i Road Type
cross_table = pd.crosstab(dades['Road_Type'], dades['Accident_Severity'])
cross_table = cross_table.div(cross_table.sum(axis=1), axis=0)

frequencies = dades['Road_Type'].value_counts()
frequencies = frequencies.reindex(cross_table.index)

fig, ax = plt.subplots(figsize=(10, 6))
cross_table.plot(kind='bar', stacked=True, color=['red', 'orange', 'green'], ax=ax)

plt.legend(title='Accident Severity', bbox_to_anchor=(1.15, 0.6), loc='upper left', fontsize = 14, title_fontsize = 14)
plt.xlabel('Road_Type', fontsize=14)
plt.xticks(rotation = 0, fontsize=12)
plt.title('PROPORCIÓ DE LA GRAVETAT DELS ACCIDENTS I\nFREQÜÈNCIA PER TIPUS DE CARRETERA', fontsize = 15)

ax2 = ax.twinx()
ax2.plot(frequencies.index, frequencies.values, color='black', linestyle='-', marker='o', label='Freqüència d\'accidents')
ax2.legend(loc='upper right', bbox_to_anchor=(1.5, 0.7), fontsize = 14)

#plt.savefig('gqq_RespRoad_2.png', bbox_inches='tight')
plt.show()

# Accident_Severity i Vehicle Type
cross_table = pd.crosstab(dades['Vehicle_Type'], dades['Accident_Severity'])
cross_table = cross_table.div(cross_table.sum(axis=1), axis=0)

frequencies = dades['Vehicle_Type'].value_counts()
frequencies = frequencies.reindex(cross_table.index)

fig, ax = plt.subplots(figsize=(10, 6))
cross_table.plot(kind='bar', stacked=True, color=['red', 'orange', 'green'], ax=ax)

plt.legend(title='Accident Severity', bbox_to_anchor=(1.15, 0.6), loc='upper left', fontsize = 14, title_fontsize = 14)
plt.xlabel('Vehicle_Type', fontsize=14)
plt.xticks(fontsize=12)
plt.title('PROPORCIÓ DE LA GRAVETAT DELS ACCIDENTS I\nFREQÜÈNCIA PER TIPUS DE VEHÍCLE', fontsize = 15)

ax2 = ax.twinx()
ax2.plot(frequencies.index, frequencies.values, color='black', linestyle='-', marker='o', label='Freqüència d\'accidents')
ax2.legend(loc='upper right', bbox_to_anchor=(1.5, 0.7), fontsize = 14)

#plt.savefig('gqq_RespVehicle_2.png', bbox_inches='tight')
plt.show()

# Accident_Severity i Junction_Control
cross_table = pd.crosstab(dades['Junction_Control'], dades['Accident_Severity'])
cross_table = cross_table.div(cross_table.sum(axis=1), axis=0)

frequencies = dades['Junction_Control'].value_counts()
frequencies = frequencies.reindex(cross_table.index)

fig, ax = plt.subplots(figsize=(10, 6))
cross_table.plot(kind='bar', stacked=True, color=['red', 'orange', 'green'], ax=ax)

plt.legend(title='Accident Severity', bbox_to_anchor=(1.15, 0.6), loc='upper left', fontsize = 14, title_fontsize = 14)
plt.xlabel('Junction Control', fontsize=14)
plt.xticks(fontsize=12)
plt.title('PROPORCIÓ DE LA GRAVETAT DELS ACCIDENTS I\nFREQÜÈNCIA PER TIPUS DE REGULACIÓ EN L\'ENCREUAMENT', fontsize = 15)

ax2 = ax.twinx()
ax2.plot(frequencies.index, frequencies.values, color='black', linestyle='-', marker='o', label='Freqüència d\'accidents')
ax2.legend(loc='upper right', bbox_to_anchor=(1.5, 0.7), fontsize = 14)

#plt.savefig('gqq_RespJControl_2.png', bbox_inches='tight')
plt.show()


# Accident_Severity i Junction_Detail
cross_table = pd.crosstab(dades['Junction_Detail'], dades['Accident_Severity'])
cross_table = cross_table.div(cross_table.sum(axis=1), axis=0)

frequencies = dades['Junction_Detail'].value_counts()
frequencies = frequencies.reindex(cross_table.index)

fig, ax = plt.subplots(figsize=(10, 6))
cross_table.plot(kind='bar', stacked=True, color=['red', 'orange', 'green'], ax=ax)

plt.legend(title='Accident Severity', bbox_to_anchor=(1.15, 0.6), loc='upper left', fontsize = 14, title_fontsize = 14)
plt.xlabel('Junction Detail', fontsize=14)
plt.xticks(fontsize=12)
plt.title('PROPORCIÓ DE LA GRAVETAT DELS ACCIDENTS I\nFREQÜÈNCIA PER TIPUS D\'ENCREUAMENT', fontsize = 15)

ax2 = ax.twinx()
ax2.plot(frequencies.index, frequencies.values, color='black', linestyle='-', marker='o', label='Freqüència d\'accidents')
ax2.legend(loc='upper right', bbox_to_anchor=(1.5, 0.7), fontsize = 14)

#plt.savefig('gqq_RespJDetail_2.png', bbox_inches='tight')
plt.show()


# Accident_Severity i Light_Conditions
cross_table = pd.crosstab(dades['Light_Conditions'], dades['Accident_Severity'])
cross_table = cross_table.div(cross_table.sum(axis=1), axis=0)

frequencies = dades['Light_Conditions'].value_counts()
frequencies = frequencies.reindex(cross_table.index)

fig, ax = plt.subplots(figsize=(10, 6))
cross_table.plot(kind='bar', stacked=True, color=['red', 'orange', 'green'], ax=ax)

plt.legend(title='Accident Severity', bbox_to_anchor=(1.15, 0.6), loc='upper left', fontsize = 14, title_fontsize = 14)
plt.xlabel('Light Conditions', fontsize = 13)
plt.xticks(rotation = 0, fontsize=11)
plt.yticks(fontsize = 13)
plt.title('PROPORCIÓ DE LA GRAVETAT DELS ACCIDENTS I\nFREQÜÈNCIA PER DIFERENTS CONDICIONS DE LLUM', fontsize = 17)

ax2 = ax.twinx()
ax2.plot(frequencies.index, frequencies.values, color='black', linestyle='-', marker='o', label='Freqüència d\'accidents')
ax2.tick_params(axis='y', labelsize=13)
ax2.legend(loc='upper right', bbox_to_anchor=(1.5, 0.7), fontsize = 14)

#plt.savefig('gqq_RespLight_2.png', bbox_inches='tight')
plt.show()


# Accident_Severity i Carriageway_Hazards
cross_table = pd.crosstab(dades['Carriageway_Hazards'], dades['Accident_Severity'])
cross_table = cross_table.div(cross_table.sum(axis=1), axis=0)

frequencies = dades['Carriageway_Hazards'].value_counts()
frequencies = frequencies.reindex(cross_table.index)

fig, ax = plt.subplots(figsize=(10, 6))
cross_table.plot(kind='bar', stacked=True, color=['red', 'orange', 'green'], ax=ax)

plt.legend(title='Accident Severity', bbox_to_anchor=(1.15, 0.6), loc='upper left', fontsize = 14, title_fontsize = 14)
plt.xlabel('Carriageway Hazards', fontsize=14)
plt.xticks(fontsize=12)
plt.title('PROPORCIÓ DE LA GRAVETAT DELS ACCIDENTS I\nFREQÜÈNCIA PER DIFERENTS PERILLS EN LA CALÇADA', fontsize = 15)

ax2 = ax.twinx()
ax2.plot(frequencies.index, frequencies.values, color='black', linestyle='-', marker='o', label='Freqüència d\'accidents')
ax2.legend(loc='upper right', bbox_to_anchor=(1.5, 0.7), fontsize = 14)

#plt.savefig('gqq_RespCarriage_2.png', bbox_inches='tight')
plt.show()


# Accident_Severity i Police_Force
cross_table = pd.crosstab(dades['Police_Force'], dades['Accident_Severity'])
cross_table = cross_table.div(cross_table.sum(axis=1), axis=0)

frequencies = dades['Police_Force'].value_counts()
frequencies = frequencies.reindex(cross_table.index)

fig, ax = plt.subplots(figsize=(10, 6))
cross_table.plot(kind='bar', stacked=True, color=['red', 'orange', 'green'], ax=ax)

plt.legend(title='Accident Severity', bbox_to_anchor=(1.15, 0.6), loc='upper left', fontsize = 14, title_fontsize = 14)
plt.xlabel('Police Force', fontsize=14)
plt.xticks(fontsize=12)
plt.title('PROPORCIÓ DE LA GRAVETAT DELS ACCIDENTS I\nFREQÜÈNCIA PER LES DIFERENTS FORCES POLICIALS', fontsize = 15)

ax2 = ax.twinx()
ax2.plot(frequencies.index, frequencies.values, color='black', linestyle='-', marker='o', label='Freqüència d\'accidents')
ax2.legend(loc='upper right', bbox_to_anchor=(1.5, 0.7), fontsize = 14)

#plt.savefig('gqq_RespPolice_2.png', bbox_inches='tight')
plt.show()
In [35]:
# Freqüències relatives (FATAL)
dades_serious = dades[dades['Accident_Severity'] == 'Fatal']
total_accidents_serious = len(dades_serious)

freq_relatives = dades_serious.groupby(['Latitude', 'Longitude']).size() / total_accidents_serious
freq_relatives = freq_relatives.reset_index(name='Freqüència relativa')

escalem = MinMaxScaler()
freq_relatives[['Freqüència relativa']] = escalem.fit_transform(freq_relatives[['Freqüència relativa']])

df = pd.DataFrame({
    'latitude': freq_relatives['Latitude'],
    'longitude': freq_relatives['Longitude'],
    'F. Rel': freq_relatives['Freqüència relativa']
})

fig = px.density_mapbox(df, lat='latitude', lon='longitude', z='F. Rel',
                        radius=20,
                        center=dict(lat=min(df.latitude), lon=min(df.longitude)),
                        zoom=6,
                        mapbox_style='open-street-map',
                        height=1000,
                        width=1000)

fig.update_layout(
    coloraxis_colorbar=dict(
        title='F. Rel',  
        titlefont=dict(size=35, color='black'),  
        tickfont=dict(size=35, color='black') 
    )
)
fig.show()
In [37]:
# Freqüències relatives (SERIOUS)
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler

dades_serious = dades[dades['Accident_Severity'] == 'Serious']
total_accidents_serious = len(dades_serious)

freq_relatives = dades_serious.groupby(['Latitude', 'Longitude']).size() / total_accidents_serious
freq_relatives = freq_relatives.reset_index(name='Freqüència relativa')

escalem = MinMaxScaler()
freq_relatives[['Freqüència relativa']] = escalem.fit_transform(freq_relatives[['Freqüència relativa']])

df = pd.DataFrame({
    'latitude': freq_relatives['Latitude'],
    'longitude': freq_relatives['Longitude'],
    'F. Rel': freq_relatives['Freqüència relativa']
})

fig = px.density_mapbox(df, lat='latitude', lon='longitude', z='F. Rel',
                        radius=12,
                        center=dict(lat=min(df.latitude), lon=min(df.longitude)),
                        zoom=6,
                        mapbox_style='open-street-map',
                        height=1000,
                        width=1000)

fig.update_layout(
    coloraxis_colorbar=dict(
        title='F. Rel',  
        titlefont=dict(size=35, color='black'),  
        tickfont=dict(size=35, color='black') 
    )
)
fig.show()
In [38]:
# Freqüències relatives (SLIGHT)
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler

dades_slight = dades[dades['Accident_Severity'] == 'Slight']
total_accidents_slight = len(dades_slight)

freq_relatives = dades_slight.groupby(['Latitude', 'Longitude']).size() / total_accidents_slight
freq_relatives = freq_relatives.reset_index(name='Freqüència relativa')

escalem = MinMaxScaler()
freq_relatives[['Freqüència relativa']] = escalem.fit_transform(freq_relatives[['Freqüència relativa']])

df = pd.DataFrame({
    'latitude': freq_relatives['Latitude'],
    'longitude': freq_relatives['Longitude'],
    'F. Rel': freq_relatives['Freqüència relativa']
})

fig = px.density_mapbox(df, lat='latitude', lon='longitude', z='F. Rel',
                        radius=12,
                        center=dict(lat=min(df.latitude), lon=min(df.longitude)),
                        zoom=6,
                        mapbox_style='open-street-map',
                        height=1000,
                        width=1000)

fig.update_layout(
    coloraxis_colorbar=dict(
        title='F. Rel',  
        titlefont=dict(size=35, color='black'),  
        tickfont=dict(size=35, color='black') 
    )
)
fig.show()

Correlacions i associacions entre variables:

In [39]:
# Matriu de correlacions (Variables numèriques)
cmap = sns.diverging_palette(10, 150, s=100, l=45, sep=5, as_cmap=True,)

variables_quantitatives = ['Latitude', 'Longitude','Number_of_Casualties', 'Number_of_Vehicles', 'Speed_limit']
matriu_correlacions = dades[variables_quantitatives].corr()

plt.figure(figsize=(12, 10))
cmap_custom = sns.color_palette("Greens", as_cmap=True)
heatmap = sns.heatmap(matriu_correlacions, annot=True, cmap=cmap, fmt=".2f", linewidths=0.5, center=0,  
                      annot_kws={"fontsize":12})

for _, spine in heatmap.spines.items():
    spine.set_visible(True)
    spine.set_color('white')

heatmap.set_title('MATRIU DE CORRELACIONS', fontsize = 15)
plt.xticks(rotation=90, fontsize=11)
plt.yticks(rotation=0, fontsize=11)

#plt.savefig('CorrNumeriques.png', bbox_inches='tight')
plt.show()
In [40]:
# TRIANGLE SUPERIOR - altres tonalitats
cmap = sns.diverging_palette(10, 150, s=100, l=45, sep=5, as_cmap=True,)

variables_quantitatives = ['Latitude', 'Longitude','Number_of_Casualties', 'Number_of_Vehicles', 'Speed_limit', 'Any_Accident']
matriu_correlacions = dades[variables_quantitatives].corr()
triangle_superior_correlacions = matriu_correlacions.where(np.triu(np.ones(matriu_correlacions.shape), k = 0).astype(bool))

plt.figure(figsize=(12, 10))
cmap_custom = sns.color_palette("RdYlGn", as_cmap=True)
heatmap = sns.heatmap(triangle_superior_correlacions, annot = True, cmap = cmap_custom, fmt=".2f", linewidths = 0.5, 
                      center = 0,  annot_kws = {"fontsize":20})

for _, spine in heatmap.spines.items():
    spine.set_visible(True)
    spine.set_color('white')

heatmap.set_title('MATRIU DE CORRELACIONS', fontsize = 24)
plt.xticks(rotation = 90, fontsize = 21)
plt.yticks(rotation = 0, fontsize = 21)
plt.savefig('CorrNumeriques2.png', bbox_inches='tight')
plt.show()
In [42]:
# V Cramer - Associacions entre variables
dades_qualitatives = dades[["Day_of_Week", "Junction_Control", "Junction_Detail", "Accident_Severity", "Light_Conditions", 
                            "Carriageway_Hazards", "Police_Force", "Road_Surface_Conditions", "Road_Type", 
                           "Urban_or_Rural_Area","Weather_Conditions", "Vehicle_Type","Local_Authority_(District)"]]

def cramers_v(x, y):
    contingency_table = pd.crosstab(x, y)
    chi2, _, _, _ = chi2_contingency(contingency_table)
    n = contingency_table.sum().sum()
    k = max(contingency_table.shape)
    return np.sqrt(chi2 / (n * (k - 1)))

n_variables = len(dades_qualitatives.columns)
cramer_matrix = np.eye(n_variables)

for i in range(n_variables):
    for j in range(i + 1, n_variables):
        v_cramer = cramers_v(dades_qualitatives.iloc[:, i], dades_qualitatives.iloc[:, j])
        cramer_matrix[i, j] = v_cramer
        cramer_matrix[j, i] = v_cramer  

#cmap_custom = sns.color_palette("Greens", as_cmap=True)
cmap_custom = sns.diverging_palette(10, 150, as_cmap=True, s=100, l=45, sep=5)

plt.figure(figsize=(10, 8))
sns.heatmap(cramer_matrix, annot=True, cmap=cmap_custom, center=0, fmt=".2f", linewidths=0.5, vmin=0, vmax=1)

plt.title('V DE CRAMER - ASSOCIACIÓ ENTRE VARIABLES CATEGÒRIQUES', fontsize = 15)
plt.xticks(np.arange(n_variables) + 0.5, dades_qualitatives.columns, rotation=90, fontsize=11)
plt.yticks(np.arange(n_variables) + 0.5, dades_qualitatives.columns, rotation=0, fontsize=11)

#plt.savefig('AssociCategoriques.png', bbox_inches='tight')
plt.show()
In [44]:
# TRIANGLE SUPERIOR - altres tonalitats
dades_qualitatives = dades[["Day_of_Week", "Junction_Control", "Junction_Detail", "Accident_Severity", "Light_Conditions", 
                            "Carriageway_Hazards", "Police_Force", "Road_Surface_Conditions", "Road_Type", 
                           "Urban_or_Rural_Area","Weather_Conditions", "Vehicle_Type","Local_Authority_(District)"]]

def cramers_v(x, y):
    contingency_table = pd.crosstab(x, y)
    chi2, _, _, _ = chi2_contingency(contingency_table)
    n = contingency_table.sum().sum()
    k = max(contingency_table.shape)
    return np.sqrt(chi2 / (n * (k - 1)))

n_variables = len(dades_qualitatives.columns)
matriu_cramer = np.eye(n_variables)

for i in range(n_variables):
    for j in range(i + 1, n_variables):
        v_cramer = cramers_v(dades_qualitatives.iloc[:, i], dades_qualitatives.iloc[:, j])
        matriu_cramer[i, j] = v_cramer
        matriu_cramer[j, i] = v_cramer  

cmap_custom = sns.color_palette("RdYlGn", as_cmap=True)

plt.figure(figsize=(10, 8))
sns.heatmap(triangle_superior_vcramer, annot=True, cmap=cmap_custom, center=0, fmt=".2f", linewidths=0.5, vmin=0, vmax=1)

plt.title('V DE CRAMER - ASSOCIACIÓ ENTRE VARIABLES CATEGÒRIQUES', fontsize = 17)
plt.xticks(np.arange(n_variables) + 0.5, dades_qualitatives.columns, rotation=90, fontsize=14)
plt.yticks(np.arange(n_variables) + 0.5, dades_qualitatives.columns, rotation=0, fontsize=14)
plt.savefig('AssociCategoriques2.png', bbox_inches='tight')
plt.show()

4. PASSOS PREVIS AL MODELATGE¶

4.1 TRACTAMENT DELS VALORS PERDUTS (NA)¶

Variables amb NA

  • Carriageway_Hazards
  • Road_Surface_Conditions
  • Road_Type
  • Time
  • Wheather_Conditions
In [45]:
# Time - imputació
## darkness - no lighting
from statistics import mode
darknessnolight = dades[dades['Light_Conditions'] == "Darkness - no lighting"].copy()
darknessnolight['Time'] = pd.to_datetime(darknessnolight['Time'], format='%H:%M')
mode_temps_dark_nolight = mode(darknessnolight['Time'])
mode_temps_dark_nolight = mode_temps_dark_nolight.strftime('%H:%M')

filtre1 = (pd.isnull(dades['Time'])) & (dades['Light_Conditions'] == 'Darkness - no lighting')
dades.loc[filtre1, 'Time'] = mode_temps_dark_nolight

## daylight
daylight = dades[dades['Light_Conditions'] == "Daylight"].copy()
daylight['Time'] = pd.to_datetime(daylight['Time'], format='%H:%M')
mediana_temps_daylight = daylight['Time'].mean()
mediana_temps_daylight = mediana_temps_daylight.strftime('%H:%M')

filtre2 = (pd.isnull(dades['Time'])) & (dades['Light_Conditions'] == 'Daylight')
dades.loc[filtre2, 'Time'] = mediana_temps_daylight

## darkness- lights lit
darknesslightslit = dades[dades['Light_Conditions'] == "Darkness - lights lit"].copy()
darknesslightslit['Time'] = pd.to_datetime(darknesslightslit['Time'], format='%H:%M')
mode_temps_darknesslightslit = mode(darknesslightslit['Time'])
mode_temps_darknesslightslit = mode_temps_darknesslightslit.strftime('%H:%M')

filtre3 = (pd.isnull(dades['Time'])) & (dades['Light_Conditions'] == 'Darkness - lights lit')
dades.loc[filtre3, 'Time'] = mode_temps_darknesslightslit
In [46]:
# Creem nova variable - Moment_Dia
dades['Time'] = pd.to_datetime(dades['Time'])

filtre1 = (dades['Time'].dt.hour >= 6) & (dades['Time'].dt.hour <= 11) & (dades['Time'].dt.minute <= 59)
filtre2 = (dades['Time'].dt.hour >= 12) & (dades['Time'].dt.hour <= 14) & (dades['Time'].dt.minute <= 59)
filtre3 = (dades['Time'].dt.hour >= 15) & (dades['Time'].dt.hour <= 19) & (dades['Time'].dt.minute <= 59)
filtre4 = (dades['Time'].dt.hour >= 20) & (dades['Time'].dt.hour <= 23) & (dades['Time'].dt.minute <= 59)
filtre5 = (dades['Time'].dt.hour >= 0) & (dades['Time'].dt.hour <= 5) & (dades['Time'].dt.minute <= 59)

dades.loc[filtre1, 'Moment_Dia'] = 'Matí'
dades.loc[filtre2, 'Moment_Dia'] = 'Migdia'
dades.loc[filtre3, 'Moment_Dia'] = 'Tarda'
dades.loc[filtre4, 'Moment_Dia'] = 'Nit'
dades.loc[filtre5, 'Moment_Dia'] = 'Matinada'

categorias = ['Matí', 'Migdia', 'Tarda', 'Nit', 'Matinada']
moment_dia_dtype = pd.CategoricalDtype(categories=categorias)

dades['Moment_Dia'] = dades['Moment_Dia'].astype(moment_dia_dtype)

# Eliminem variable Time
dades = dades.drop(columns = ['Time','Accident Date'])
In [47]:
# Carriage_Hazards - Eliminem (3 NA)
dades = dades.dropna(subset = ['Carriageway_Hazards'])
In [48]:
# DUMMIES
# dummies variables amb pocs nivells
dades_dummies = dades.drop(['Local_Authority_(District)','Police_Force'],axis=1)
dades_dummies = pd.get_dummies(dades_dummies, dummy_na=False)

# dummies variable Local Authority District
dades_dummies1 = dades.loc[:, ["Local_Authority_(District)"]]
frequencies_LocalAuthority = dades['Local_Authority_(District)'].value_counts()
x = 2500
classes_menys_frequents1 =  frequencies_LocalAuthority[frequencies_LocalAuthority < x].index
dades_dummies1 = dades_dummies1.replace(classes_menys_frequents1, 'Altres')
dades_dummies1 = pd.get_dummies(dades_dummies1)

# dummies variable Police Force
dades_dummies2 = dades.loc[:, ["Police_Force"]]
frequencies_PoliceForce = dades['Police_Force'].value_counts()
x = 9500
classes_menys_frequents2 =  frequencies_PoliceForce[frequencies_PoliceForce < x].index
dades_dummies2 = dades_dummies2.replace(classes_menys_frequents2, 'Altres')
dades_dummies2 = pd.get_dummies(dades_dummies2)

dades_dummies = pd.concat([dades_dummies, dades_dummies1], axis = 1)
dades_dummies = pd.concat([dades_dummies, dades_dummies2], axis = 1)
In [50]:
# Transformem 0 en NA (variables "Road_Type", "Road_Surface_Conditions" i "Weather_Conditions")
selected_columns1 = [col for col in dades_dummies.columns if col.startswith("Road_Type")]

for index, row in dades_dummies.iterrows():
    if row[selected_columns1].sum() == 0:
        dades_dummies.loc[index, selected_columns1] = pd.NA
        
selected_columns2 = [col for col in dades_dummies.columns if col.startswith("Road_Surface_Conditions")]

for index, row in dades_dummies.iterrows():
    if row[selected_columns2].sum() == 0:
        dades_dummies.loc[index, selected_columns2] = pd.NA
        
selected_columns3 = [col for col in dades_dummies.columns if col.startswith("Weather_Conditions")]

for index, row in dades_dummies.iterrows():
    if row[selected_columns3].sum() == 0:
        dades_dummies.loc[index, selected_columns3] = pd.NA

#dades_dummies.to_csv('dades_dummies2.csv', index=False)
In [51]:
# Imputació NA
dades_dummies2 = pd.read_csv('dades_dummies2.csv')
imputer = KNNImputer(n_neighbors = 5)  # per defecte: strategy = mean
dades_imputades = pd.DataFrame(imputer.fit_transform(dades_dummies2), columns = dades_dummies2.columns)

#dades_imputades.to_csv('dades_imputades.csv', index=False) 
In [52]:
# Comprovem NA
dades_imputades = pd.read_csv('dades_imputades.csv')
dades_imputades.isna().sum().sum()
Out[52]:
0
In [53]:
# 1: a les variables dummies on hi hagi el màxim, 0: les altres i en cas d'empat aleatòriament
dades_imputades = pd.read_csv('dades_imputades.csv')
categories = ["Road_Type", "Road_Surface_Conditions", "Weather_Conditions"]

for categoria in categories:
    col = dades_imputades.filter(regex=f'^{categoria}', axis=1).columns
    
    for index, row in dades_imputades.iterrows():
        files_val = row[col]
        valor_max = files_val.max()
        cont_max = (files_val == valor_max).sum()
        
        if cont_max > 1:
            indexs_max = files_val[files_val == valor_max].index.tolist()
            index_max = np.random.choice(indexs_max)
        else:
            max_index = files_val.idxmax()
        
        dades_imputades.loc[index, col] = 0
        dades_imputades.at[index, max_index] = 1
In [55]:
# Reconvertim dummies en variables categòriques

warnings.simplefilter(action='ignore', category=FutureWarning) # ignorem warnings


patrons_prefixos = {
    'Day_of_Week': ('^Day_of_Week_', ''),
    'Junction_Control': ('^Junction_Control_', ''),
    'Junction_Detail': ('^Junction_Detail_', ''),
    'Accident_Severity': ('^Accident_Severity_', ''),
    'Light_Conditions': ('^Light_Conditions_', ''),
    'Carriageway_Hazards': ('^Carriageway_Hazards_', ''),
    'Road_Surface_Conditions': ('^Road_Surface_Conditions_', ''),
    'Road_Type': ('^Road_Type_', ''),
    'Urban_or_Rural_Area': ('^Urban_or_Rural_Area_', ''),
    'Weather_Conditions': ('^Weather_Conditions_', ''),
    'Vehicle_Type': ('^Vehicle_Type_', ''),
    'Any_Accident': ('^Any_Accident_', ''),
    'Moment_Dia': ('^Moment_Dia_', ''),
    'Local_Authority_District': ('^Local_Authority_\(District\)_', ''),
    'Police_Force': ('^Police_Force_', '')
    
    
}

dades2 = pd.DataFrame()

for i, (patro, prefix) in patrons_prefixos.items():
    variables = dades_imputades.filter(regex = patro, axis = 1)
    categories = variables.idxmax(axis=1)
    categories = categories.str.replace(patro, "")

    dades2[i] = categories

dades2 = pd.concat([dades2, dades_imputades[['Latitude','Longitude','Number_of_Casualties','Number_of_Vehicles',
                                       'Speed_limit','Mes_Accident','Dia_Accident']]], axis=1)
        
#dades2.to_csv('dades2.csv', index=False)

4.2 SYNTHETIC MINORITY OVERSAMPLING TECHNIQUE (SMOTE)¶

In [56]:
dades2 = pd.read_csv('dades2.csv') 

# Convertim les variables de tipus object a categòriques
for var in dades2.columns:
    if dades2[var].dtype == 'object':
        dades2[var] = dades2[var].astype('category')

Tècnica SMOTE

In [57]:
# Convertim les variables categòriques a Dummy (menys la variable resposta)
no_dummy = dades2['Accident_Severity']
si_dummies = dades2.drop('Accident_Severity', axis=1)  
columnes_dummies = pd.get_dummies(si_dummies)
dades_dummies2 = pd.concat([no_dummy, columnes_dummies], axis=1)

le = LabelEncoder()
dades_dummies2['Accident_Severity'] = le.fit_transform(dades2['Accident_Severity']) #0: Fatal, 1: Serious, 2: Slight

X = dades_dummies2.drop(['Accident_Severity'], axis=1)
y = dades_dummies2['Accident_Severity']

smote = SMOTE(random_state=123)
X_resampled, y_resampled = smote.fit_resample(X, y)

dades_equilibrades = pd.DataFrame(X_resampled, columns=X.columns)
dades_equilibrades['Accident_Severity'] = y_resampled

dades_equilibrades.to_csv('dades_equilibrades.csv', index=False)
dades_equilibrades = pd.read_csv('dades_equilibrades.csv') 
In [58]:
# Accident_Severity - Dades balancejades
frequencia_accident_sev = dades_equilibrades['Accident_Severity'].value_counts().sort_index()
colors = ['red','orange','green']
start_angles = 90
plt.pie(frequencia_accident_sev.values, labels = ['Fatal', 'Serious', 'Slight'], autopct='%1.1f%%',
        startangle=start_angles, textprops={'fontsize': 12, 'color': 'white'}, labeldistance=None, colors=colors)
plt.title('FREQÜÈNCIA DE LA GRAVETAT DELS ACCIDENTS', fontsize = 15)
plt.legend(loc='upper right', bbox_to_anchor=(1.2, 1), prop={'size': 12})
#plt.savefig('gq_resp_2')
plt.show()

4.3 SELECCIÓ DE VARIABLES¶

Per continuar amb la selecció de variables he fet dues proves diferents:

  • Per les variables numèriques: prova ANOVA. Ens retorna les variables numèriques ordenades descendentment segons les seves puntuacions F en ANOVA.
  • Per les variables categòriques: test chi-quadrat. Ens retorna les variables categòriques més rellevants mitjançant la prova chi-quadrat, ens les retorna ordenades descendentment.
  • Mitjançant l'algorisme Random Forest
In [59]:
# variables numèriques (DADES DESBALANCEJADES)
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

X = dades2.drop('Accident_Severity', axis=1) 
y = dades2['Accident_Severity'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 123)

X_train_numeric = X_train.select_dtypes(include='number')

k = 3 

selector = SelectKBest(score_func=f_classif, k=k)
selector.fit(X_train_numeric, y_train)

scores = selector.scores_ # prova F
variables_scores = pd.DataFrame({'Variable': X_train_numeric.columns, 'Score': scores}) # creem data frame  
sorted_variables = variables_scores.sort_values(by='Score', ascending=False) # ordenem descendentment
sorted_variables_names = sorted_variables['Variable'].values # obtenim els noms de les variables ordenades descendentment segons F

print(sorted_variables)
               Variable       Score
4    Number_of_Vehicles  949.924485
5           Speed_limit  770.110096
3  Number_of_Casualties  537.718159
1              Latitude   48.236906
0          Any_Accident   36.802173
6          Mes_Accident   22.249343
7          Dia_Accident    3.893721
2             Longitude    0.429082
In [60]:
# variables categòriques (DADES DESBALANCEJADES)

X_train_categorical = X_train.select_dtypes(include=['object', 'category'])
X_train_encoded = pd.get_dummies(X_train_categorical)

k = 4  

selector = SelectKBest(score_func=chi2, k=k)
X_train_selected = selector.fit_transform(X_train_encoded, y_train)

indexs = selector.get_support(indices=True) # obtenim els índexs de les característiques seleccionades
variables = X_train_encoded.columns[indexs] # obtenim els noms de les característiques

chi2_scores = selector.scores_ # obtenim valors prova chi-quadrat
variables_chi2_scores = pd.DataFrame({'Variable': X_train_encoded.columns, 'Chi2 Score': chi2_scores}) # creem df
sorted_variable_chi2_scores = variables_chi2_scores.sort_values(by='Chi2 Score', ascending=False) # ordenem descendentment
sorted_variable_names = sorted_variable_chi2_scores['Variable'].values # ordenem els noms 

pd.set_option('display.max_rows', None)
print(variables_chi2_scores)
                                             Variable  Chi2 Score
0                                  Day_of_Week_Friday   11.150113
1                                  Day_of_Week_Monday   34.939034
2                                Day_of_Week_Saturday  103.185878
3                                  Day_of_Week_Sunday  268.458262
4                                Day_of_Week_Thursday   13.239020
5                                 Day_of_Week_Tuesday   21.985900
6                               Day_of_Week_Wednesday   41.358798
7                  Junction_Control_Authorised person   11.519194
8                Junction_Control_Auto traffic signal  208.674754
9       Junction_Control_Data missing or out of range  515.920393
10          Junction_Control_Give way or uncontrolled  306.003985
11  Junction_Control_Not at junction or within 20 ...  199.337075
12                         Junction_Control_Stop sign   20.960614
13                         Junction_Detail_Crossroads   46.444317
14                    Junction_Detail_Mini-roundabout   61.456488
15  Junction_Detail_More than 4 arms (not roundabout)   16.277993
16  Junction_Detail_Not at junction or within 20 m...  713.587596
17                     Junction_Detail_Other junction   35.345519
18          Junction_Detail_Private drive or entrance   11.365406
19                         Junction_Detail_Roundabout  571.525183
20                          Junction_Detail_Slip road   13.804962
21            Junction_Detail_T or staggered junction   91.216898
22       Light_Conditions_Darkness - lighting unknown    4.297656
23             Light_Conditions_Darkness - lights lit   44.869162
24            Light_Conditions_Darkness - no lighting  907.307683
25                          Light_Conditions_Daylight  126.252162
26  Carriageway_Hazards_Any animal in carriageway ...    8.239916
27                           Carriageway_Hazards_None    0.132327
28           Carriageway_Hazards_Other object on road   16.414357
29  Carriageway_Hazards_Pedestrian in carriageway ...    3.259509
30              Carriageway_Hazards_Previous accident   18.153420
31           Carriageway_Hazards_Vehicle load on road    1.619671
32                        Road_Surface_Conditions_Dry   30.884283
33       Road_Surface_Conditions_Flood over 3cm. deep   13.491140
34               Road_Surface_Conditions_Frost or ice   73.338038
35                       Road_Surface_Conditions_Snow   68.369358
36                Road_Surface_Conditions_Wet or damp   18.680857
37                         Road_Type_Dual carriageway  139.011225
38                           Road_Type_One way street   14.857857
39                               Road_Type_Roundabout  389.136512
40                       Road_Type_Single carriageway  100.024562
41                                Road_Type_Slip road   78.199591
42                          Urban_or_Rural_Area_Rural  914.912800
43                          Urban_or_Rural_Area_Urban  503.987597
44               Weather_Conditions_Fine + high winds    9.275528
45              Weather_Conditions_Fine no high winds   27.161858
46                     Weather_Conditions_Fog or mist    2.558448
47                           Weather_Conditions_Other   51.414316
48            Weather_Conditions_Raining + high winds    1.251197
49           Weather_Conditions_Raining no high winds   51.018101
50            Weather_Conditions_Snowing + high winds    4.876933
51           Weather_Conditions_Snowing no high winds   84.404350
52                  Vehicle_Type_Agricultural vehicle    1.425087
53  Vehicle_Type_Bus or coach (17 or more pass seats)    3.049691
54                                   Vehicle_Type_Car    0.887333
55         Vehicle_Type_Goods 7.5 tonnes mgw and over    5.647425
56       Vehicle_Type_Goods over 3.5t. and under 7.5t    1.442424
57      Vehicle_Type_Minibus (8 - 16 passenger seats)    2.144449
58            Vehicle_Type_Motorcycle 125cc and under    0.676551
59             Vehicle_Type_Motorcycle 50cc and under    0.347218
60  Vehicle_Type_Motorcycle over 125cc and up to 5...    3.320505
61                 Vehicle_Type_Motorcycle over 500cc    0.348431
62                         Vehicle_Type_Other vehicle    0.497951
63                 Vehicle_Type_Taxi/Private hire car    2.629232
64   Vehicle_Type_Van / Goods 3.5 tonnes mgw or under    1.507059
65                                Moment_Dia_Matinada  820.647690
66                                    Moment_Dia_Matí  127.783691
67                                  Moment_Dia_Migdia   43.636929
68                                     Moment_Dia_Nit  178.378937
69                                   Moment_Dia_Tarda   41.604374
70                    Local_Authority_District_Altres    3.469014
71                Local_Authority_District_Birmingham   16.253787
72                  Local_Authority_District_Bradford    5.356357
73                  Local_Authority_District_Cornwall   20.703415
74                     Local_Authority_District_Leeds    5.501108
75                 Local_Authority_District_Liverpool    5.818509
76                Local_Authority_District_Manchester   30.022081
77                 Local_Authority_District_Sheffield   12.019601
78               Local_Authority_District_Westminster  479.398823
79                                Police_Force_Altres   86.477701
80                    Police_Force_Greater Manchester   58.152181
81                                  Police_Force_Kent   51.471520
82                   Police_Force_Metropolitan Police  105.607938
83                         Police_Force_Thames Valley    2.912911
84                         Police_Force_West Midlands   21.086930
85                        Police_Force_West Yorkshire    2.575736
In [61]:
filtre = ["Day_of_Week_","Light_Conditions_","Carriageway_Hazards_","Road_Surface_Conditions_",
          "Road_Type_","Urban_or_Rural_Area_","Weather_Conditions_","Moment_Dia_","Police_Force_",
                              "Junction_Detail_","Junction_Control_","Local_Authority_District_",
                              "Vehicle_Type_"]

chi_quadrat_resultats = {}

for i in filtre:
    filtered_rows = variables_chi2_scores[variables_chi2_scores['Variable'].str.startswith(i)]
    valors = filtered_rows['Chi2 Score'].tolist()
    
    if valors:
        chi_quadrat_resultats[i] = sum(valors) / len(valors)
    else:
        chi_quadrat_resultats[i] = 0


print("Mitjana del valor Chi2 Score per variable:")
for grup, mean in chi_quadrat_resultats.items():
    print(f"'{grup}': {mean}")
        
Mitjana del valor Chi2 Score per variable:
'Day_of_Week_': 70.6167150991861
'Light_Conditions_': 270.68166561505626
'Carriageway_Hazards_': 7.969866854732
'Road_Surface_Conditions_': 40.95273489153306
'Road_Type_': 144.2459494535297
'Urban_or_Rural_Area_': 709.4501987944527
'Weather_Conditions_': 28.99509141985115
'Moment_Dia_': 242.41032405814286
'Police_Force_': 46.89784525351007
'Junction_Detail_': 173.4471511589615
'Junction_Control_': 210.40266896740152
'Local_Authority_District_': 64.28252158885047
'Vehicle_Type_': 1.840258219268028

Mateix procediment però amb les dades balancejades.

In [62]:
# VARIABLES NUMÈRIQUES
## reconvertim dummies en variables categòriques
warnings.simplefilter(action='ignore', category=FutureWarning) # ignorem warnings
patrons_prefixos = {
    'Day_of_Week': ('^Day_of_Week_', ''),
    'Junction_Control': ('^Junction_Control_', ''),
    'Junction_Detail': ('^Junction_Detail_', ''),
    'Light_Conditions': ('^Light_Conditions_', ''),
    'Carriageway_Hazards': ('^Carriageway_Hazards_', ''),
    'Road_Surface_Conditions': ('^Road_Surface_Conditions_', ''),
    'Road_Type': ('^Road_Type_', ''),
    'Urban_or_Rural_Area': ('^Urban_or_Rural_Area_', ''),
    'Vehicle_Type': ('^Vehicle_Type_', ''),
    'Weather_Conditions': ('^Weather_Conditions_', ''),
    'Moment_Dia': ('^Moment_Dia_', ''),
    'Local_Authority_District': ('^Local_Authority_District_', ''),
    'Police_Force': ('^Police_Force_', '')
}

df_eq = pd.DataFrame()

for i, (patro, prefix) in patrons_prefixos.items():
    variables = dades_equilibrades.filter(regex = patro, axis = 1)
    categories = variables.idxmax(axis=1)
    categories = categories.str.replace(patro, "")

    df_eq[i] = categories

dades_equilibrades2 = pd.concat([df_eq, dades_equilibrades[['Latitude','Longitude','Number_of_Casualties',
                                                            'Number_of_Vehicles','Speed_limit','Mes_Accident',
                                                            'Any_Accident','Dia_Accident','Accident_Severity']]], axis=1)
        

X = dades_equilibrades2.drop('Accident_Severity', axis=1) 
y = dades_equilibrades2['Accident_Severity'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123)

X_train_numeric = X_train.select_dtypes(include='number')

k = 3 

selector = SelectKBest(score_func=f_classif, k=k)
selector.fit(X_train_numeric, y_train)

scores = selector.scores_ # prova F
variables_scores = pd.DataFrame({'Variable': X_train_numeric.columns, 'Score': scores}) 
sorted_variables = variables_scores.sort_values(by='Score', ascending=False) 
sorted_variables_names = sorted_variables['Variable'].values # variables ordenades descendentment segons F

print(sorted_variables)
               Variable         Score
4           Speed_limit  19259.520527
6          Any_Accident  17151.807911
2  Number_of_Casualties   9523.761299
3    Number_of_Vehicles   5242.444710
5          Mes_Accident    792.982801
0              Latitude    178.431324
7          Dia_Accident     19.493244
1             Longitude     10.629295
In [63]:
# VARIABLES CATEGÒRIQUES
X_train_categorical = X_train.select_dtypes(include=['object', 'category'])
X_train_encoded = pd.get_dummies(X_train_categorical)

k = 4  

selector = SelectKBest(score_func=chi2, k=k)
X_train_selected = selector.fit_transform(X_train_encoded, y_train)

indexs = selector.get_support(indices=True) # obtenim indexs característiques seleccionades
variables = X_train_encoded.columns[indexs] # obtenim noms de les característiques

chi2_scores = selector.scores_ # obtenim valors prova chi-quadrat
variables_chi2_scores = pd.DataFrame({'Variable': X_train_encoded.columns, 'Chi2 Score': chi2_scores}) # creem df
sorted_variable_chi2_scores = variables_chi2_scores.sort_values(by='Chi2 Score', ascending=False) # ordenem descendentment
sorted_variable_names = sorted_variable_chi2_scores['Variable'].values # ordenem els noms 

pd.set_option('display.max_rows', None)

filtre = ["Day_of_Week_","Light_Conditions_","Carriageway_Hazards_","Road_Surface_Conditions_",
          "Road_Type_","Urban_or_Rural_Area_","Weather_Conditions_","Moment_Dia_","Police_Force_",
                              "Junction_Detail_","Junction_Control_","Local_Authority_District_",
                              "Vehicle_Type_"]

chi_quadrat_resultats = {}

for i in filtre:
    filtered_rows = variables_chi2_scores[variables_chi2_scores['Variable'].str.startswith(i)]
    valors = filtered_rows['Chi2 Score'].tolist()
    
    if valors:
        chi_quadrat_resultats[i] = sum(valors) / len(valors)
    else:
        chi_quadrat_resultats[i] = 0

chi_quadrat_resultats = dict(sorted(chi_quadrat_resultats.items(), key = lambda item: item [1], reverse = True))

print("Mitjana del valor Chi2 Score per variable:")
for grup, mean in chi_quadrat_resultats.items():
    print(f"'{grup}': {mean}")
Mitjana del valor Chi2 Score per variable:
'Moment_Dia_': 32527.51260661045
'Day_of_Week_': 21597.251926750083
'Junction_Control_': 20928.113076994767
'Urban_or_Rural_Area_': 20298.407908547226
'Light_Conditions_': 19281.860310474327
'Junction_Detail_': 10966.625724785039
'Vehicle_Type_': 8488.073508875897
'Weather_Conditions_': 7245.564117047502
'Road_Type_': 7200.097374067741
'Road_Surface_Conditions_': 4986.327467088345
'Police_Force_': 4484.9779902606615
'Local_Authority_District_': 2386.0820304830504
'Carriageway_Hazards_': 981.6692056060101

Importància de les variables mitjançant Random Forest:

In [64]:
# Busquem els millors paràmetres per realitzar un Random Forest
X = dades_equilibrades.drop('Accident_Severity', axis=1) 
y = dades_equilibrades['Accident_Severity'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123)

param_grid = {
    'max_depth': [None, 3 ,5, 7, 10, 15],
    'min_samples_leaf': [1, 2, 5, 10, 20, 50],
    'min_samples_split': [2, 5, 10, 20, 50, 100],
    'bootstrap': [True, False],
    'criterion': ["gini", "entropy", "log_loss"]
}

rf = RandomForestClassifier(random_state = 123)

search = HalvingGridSearchCV(rf, param_grid, cv=5, factor=2)

search.fit(X_train, y_train)


print(search.best_params_)
print(np.mean(search.best_score_))

#bootstrap: False, criterion: 'entropy', max_depth: None, min_samples_leaf: 1, min_samples_split: 2
{'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
0.9378433598737882
In [69]:
# Mirem la importància de cada una de les variables entrenant un Random Forest
X = dades_equilibrades.drop('Accident_Severity', axis=1) 
y = dades_equilibrades['Accident_Severity'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123)

rf = RandomForestClassifier(random_state = 123)

parametres = {
    'n_estimators': [300],
    'max_depth': [None],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'bootstrap': [False],
    'criterion': ['entropy']
}

model_rf = GridSearchCV(estimator = rf, param_grid = parametres, cv = 5, scoring = 'accuracy', n_jobs = -1)
model_rf.fit(X_train, y_train)

millors_parametres = model_rf.best_params_
millor_model = model_rf.best_estimator_
In [70]:
importancies = millor_model.feature_importances_

df_importancies = pd.DataFrame({
    'Variable': X.columns,
    'Importància': importancies
})

df_importancies = df_importancies.sort_values(by='Importància', ascending=False)

filtre = ["Any_Accident", "Number_of_Casualties", "Number_of_Vehicles", "Speed_limit", "Mes_Accident",
          "Dia_Accident", "Day_of_Week_", "Junction_Control_", "Junction_Detail_", "Light_Conditions_","Carriageway_Hazards_",
          "Road_Surface_Conditions_", "Road_Type_","Urban_or_Rural_Area_","Weather_Conditions_", "Vehicle_Type_", 
          "Moment_Dia_","Police_Force_","Local_Authority_District_","Latitude","Longitude"]

mitjanes_importancies = {}

for i in filtre:
    df_prefixos = df_importancies[df_importancies['Variable'].str.startswith(i)]
    mitjana = df_prefixos['Importància'].mean()
    mitjanes_importancies[i] = mitjana

mitjanes_importancies = pd.DataFrame(list(mitjanes_importancies.items()), columns=['Variable', 'Importància'])
#mitjanes_importancies.to_csv('mitjanes_importancies_tot.csv', index=False)
In [75]:
mitjanes_importancies = pd.read_csv('mitjanes_importancies_tot.csv') 
mitjanes_importancies.sort_values(by = "Importància", ascending = False)

dades2 = dades2.drop(columns=['Carriageway_Hazards','Local_Authority_District', 'Vehicle_Type', 'Weather_Conditions',
                              'Junction_Detail', 'Police_Force', 'Road_Type', 'Road_Surface_Conditions'])

var_eliminar = ['Carriageway_Hazards','Local_Authority_District', 'Vehicle_Type', 'Weather_Conditions',
                              'Junction_Detail', 'Police_Force', 'Road_Type', 'Road_Surface_Conditions']

eliminem = [x for x in dades_equilibrades.keys()
                     if any(x.startswith(var) for var in var_eliminar)]
for x in eliminem:
    del dades_equilibrades[x]

5. MÈTRIQUES PER AVALUAR LA CAPACITAT PREDICTIVA¶

6.MODELS PREDICTIUS¶

6.1 REGRESSIÓ LOGÍSTICA MULTINOMIAL¶

In [76]:
# Dades inicials (desbalancejades)
print(dades2['Accident_Severity'].value_counts())

# Tècnica smote aplicada (balancejades)
print(dades_equilibrades['Accident_Severity'].value_counts())
Slight     263272
Serious     40739
Fatal        3953
Name: Accident_Severity, dtype: int64
1    263272
2    263272
0    263272
Name: Accident_Severity, dtype: int64

Base de dades inicial (desequilibri de classes)

In [77]:
no_dummy = dades2['Accident_Severity']
si_dummies = dades2.drop('Accident_Severity', axis=1)  
columnes_dummies = pd.get_dummies(si_dummies)
dades_desequilibrades = pd.concat([no_dummy, columnes_dummies], axis=1)

le = LabelEncoder()
dades_desequilibrades['Accident_Severity'] = le.fit_transform(dades_desequilibrades['Accident_Severity'])
In [78]:
warnings.filterwarnings("ignore")

start_time = time.time()

X = dades_desequilibrades.drop('Accident_Severity', axis=1)
y = dades_desequilibrades['Accident_Severity']

escalar = StandardScaler()
X_escalat = pd.DataFrame(escalar.fit_transform(X))

noms_classes = ["Fatal", "Serious", "Slight"] # 0, 1, 2

model_reglog = LogisticRegression(
    multi_class = 'multinomial',
    solver = 'lbfgs',
    max_iter = 2000
)

cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 123)

accuracy_reglog_0 =[]
precisio_reglog_0_3, precisio_reglog_0 = [], []
f1_reglog_0_3, f1_reglog_0 = [], []
sensibilitat_reglog_0_3, sensibilitat_reglog_0 = [], []
balanced_accuracy_reglog_0 = []
mc_reglog_0 = []
especificitat_reglog_0_3, especificitat_reglog_0 = [], []
mae_reglog_0 = []

for train_index, test_index in cv.split(X, y):
    X_train, X_test = X_escalat.iloc[train_index], X_escalat.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
   
    model_reglog.fit(X_train, y_train)
    y_pred_reglog = model_reglog.predict(X_test)
    cm = confusion_matrix(y_test, y_pred_reglog)
    cm_df = pd.DataFrame(cm, index = noms_classes, columns = noms_classes)
   
    # Mètriques
    # Accuracy, Precisió, F1 Score, Sensibilitat
    accuracy_reglog_0.append(accuracy_score(y_test, y_pred_reglog))
    precisio_reglog_0_3.append(precision_score(y_test, y_pred_reglog, average=None))
    precisio_reglog_0.append(precision_score(y_test, y_pred_reglog, average = 'weighted'))
    f1_reglog_0_3.append(f1_score(y_test, y_pred_reglog, average = None))
    f1_reglog_0.append(f1_score(y_test, y_pred_reglog, average = 'weighted'))
    sensibilitat_reglog_0_3.append(recall_score(y_test, y_pred_reglog, average = None))
    sensibilitat_ponderada = recall_score(y_test, y_pred_reglog, average = 'weighted')
    sensibilitat_reglog_0.append(sensibilitat_ponderada)
    mc_reglog_0.append(cm)
   
    # Especificitat
    x = np.diag(cm)

    for i in noms_classes:
        index = noms_classes.index(i)
        TN = cm_df.drop(i, axis = 0).drop(i, axis = 1).values.sum()
        FP = cm_df[i].sum() - cm_df.loc[i, i]
        especificitat = TN / (TN + FP) if (TN + FP) > 0 else 0
        especificitat_reglog_0_3.append(especificitat)
       
    especificitat_ponderada = (especificitat_reglog_0_3[0]*x[0] + especificitat_reglog_0_3[1]*x[1] +
                        especificitat_reglog_0_3[2]*x[2]) / sum(x)
   
    especificitat_reglog_0.append(especificitat_ponderada)
   
    # Balanced Accuracy
    ba = (sensibilitat_ponderada + especificitat_ponderada) / 2
    balanced_accuracy_reglog_0.append(ba)
   
    # MAE
    pesos = np.array([
    [0, 1, 2],
    [1, 0, 1],
    [2, 1, 0]
    ])

    n = np.sum(cm)
    mae = np.sum(cm * pesos) / n
    mae_reglog_0.append(mae)
   
end_time = time.time()
total_time_reglog_0 = round(end_time - start_time,2)
   
# Matriu de confusió
matriu_conf_reglog_0 = np.round(np.mean(mc_reglog_0, axis = 0))
matriu_conf_reglog_0_df = pd.DataFrame(matriu_conf_reglog_0, index=["Fatal", "Serious", "Slight"],
                                   columns=["Fatal", "Serious", "Slight"])
matriu_conf_reglog_0_df = np.round(matriu_conf_reglog_0_df)
plt.rcParams.update({'font.size': 13})
np.set_printoptions(formatter={'float': '{:.0f}'.format})

sns.heatmap(matriu_conf_reglog_0_df, annot=True, cmap='Greens', fmt='g')
plt.xlabel('Classe predita', fontsize = 15)
plt.ylabel('Classe vertadera', fontsize = 15)
plt.xticks(ticks = np.arange(len(noms_classes)), labels = noms_classes, rotation = 0, fontsize = 14)
plt.yticks(ticks = np.arange(len(noms_classes)), labels = noms_classes, rotation = 0, fontsize = 14)
plt.title('Matriu de Confusió', fontsize = 17)
plt.show()     
In [79]:
# mètriques
np.set_printoptions(precision=3)
print(especificitat_reglog_0_3)
print(especificitat_reglog_0)
print(mae_reglog_0)
print(f1_reglog_0_3)
print(precisio_reglog_0_3)
print(balanced_accuracy_reglog_0)
print(sensibilitat_reglog_0)
print(f1_reglog_0)
print(accuracy_reglog_0)
[1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0]
[0.15794002565226567, 0.15797249687464485, 0.15797249687464485, 0.15797249687464485, 0.1579263540719574]
[array([0.   , 0.   , 0.922]), array([0.   , 0.   , 0.922]), array([0.   , 0.   , 0.922]), array([0.   , 0.   , 0.922]), array([0.   , 0.   , 0.922])]
[array([0.   , 0.   , 0.855]), array([0.   , 0.   , 0.855]), array([0.   , 0.   , 0.855]), array([0.   , 0.   , 0.855]), array([0.   , 0.   , 0.855])]
[0.42744305359375256, 0.4274349357881577, 0.4274349357881577, 0.4274349357881577, 0.4274499935056501]
[0.8548861071875051, 0.8548698715763154, 0.8548698715763154, 0.8548698715763154, 0.8548999870113002]
[0.7880055313696184, 0.7879824978857968, 0.7879824978857968, 0.7879824978857968, 0.7880252228256324]
[0.8548861071875051, 0.8548698715763154, 0.8548698715763154, 0.8548698715763154, 0.8548999870113002]

Base de dades (classes balancejades - SMOTE)

In [80]:
start_time = time.time()

X = dades_equilibrades.drop('Accident_Severity', axis=1)
y = dades_equilibrades['Accident_Severity']

escalar = StandardScaler()
X_escalat = pd.DataFrame(escalar.fit_transform(X))

noms_classes = ["Fatal", "Serious", "Slight"] # 0, 1, 2

model_reglog = LogisticRegression(
    multi_class = 'multinomial',
    solver = 'lbfgs',
    max_iter = 2000
)

cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 123)

accuracy_reglog =[]
precisio_reglog_3, precisio_reglog = [], []
f1_reglog_3, f1_reglog = [], []
sensibilitat_reglog_3, sensibilitat_reglog = [], []
balanced_accuracy_reglog = []
mc_reglog = []
especificitat_reglog_3, especificitat_reglog = [], []
mae_reglog = []


for train_index, test_index in cv.split(X, y):
    X_train, X_test = X_escalat.iloc[train_index], X_escalat.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
   
    model_reglog.fit(X_train, y_train)
    y_pred_reglog = model_reglog.predict(X_test)
    cm = confusion_matrix(y_test, y_pred_reglog)
    cm_df = pd.DataFrame(cm, index = noms_classes, columns = noms_classes)
   
    # Mètriques
    # Accuracy, Precisió, F1 Score, Sensibilitat
    accuracy_reglog.append(accuracy_score(y_test, y_pred_reglog))
    precisio_reglog_3.append(precision_score(y_test, y_pred_reglog, average=None))
    precisio_reglog.append(precision_score(y_test, y_pred_reglog, average = 'weighted'))
    f1_reglog_3.append(f1_score(y_test, y_pred_reglog, average = None))
    f1_reglog.append(f1_score(y_test, y_pred_reglog, average = 'weighted'))
    sensibilitat_reglog_3.append(recall_score(y_test, y_pred_reglog, average = None))
    sensibilitat_ponderada = recall_score(y_test, y_pred_reglog, average = 'weighted')
    sensibilitat_reglog.append(sensibilitat_ponderada)
    mc_reglog.append(cm)
   
    # Especificitat
    x = np.diag(cm)

    for i in noms_classes:
        index = noms_classes.index(i)
        TN = cm_df.drop(i, axis = 0).drop(i, axis = 1).values.sum()
        FP = cm_df[i].sum() - cm_df.loc[i, i]
        especificitat = TN / (TN + FP) if (TN + FP) > 0 else 0
        especificitat_reglog_3.append(especificitat)
       
    especificitat_ponderada = (especificitat_reglog_3[0]*x[0] + especificitat_reglog_3[1]*x[1] +
                        especificitat_reglog_3[2]*x[2]) / sum(x)
   
    especificitat_reglog.append(especificitat_ponderada)
   
    # Balanced Accuracy
    ba = (sensibilitat_ponderada + especificitat_ponderada) / 2
    balanced_accuracy_reglog.append(ba)
   
    # MAE
    pesos = np.array([
    [0, 1, 2],
    [1, 0, 1],
    [2, 1, 0]
    ])

    n = np.sum(cm)
    mae = np.sum(cm * pesos) / n
    mae_reglog.append(mae)
    
end_time = time.time()
total_time_reglog = round(end_time - start_time,2)
   
# Matriu de confusió
matriu_conf_reglog = np.round(np.mean(mc_reglog, axis = 0))
matriu_conf_reglog_df = pd.DataFrame(matriu_conf_reglog, index=["Fatal", "Serious", "Slight"],
                                   columns=["Fatal", "Serious", "Slight"])
matriu_conf_reglog_df = np.round(matriu_conf_reglog_df)
plt.rcParams.update({'font.size': 13})
np.set_printoptions(formatter={'float': '{:.0f}'.format})

sns.heatmap(matriu_conf_reglog_df, annot=True, cmap='Greens', fmt='g')
plt.xlabel('Classe predita', fontsize = 15)
plt.ylabel('Classe vertadera', fontsize = 15)
plt.xticks(ticks = np.arange(len(noms_classes)), labels = noms_classes, rotation = 0, fontsize = 14)
plt.yticks(ticks = np.arange(len(noms_classes)), labels = noms_classes, rotation = 0, fontsize = 14)
plt.title('Matriu de Confusió', fontsize = 17)
plt.show()
In [81]:
# mètriques
np.set_printoptions(precision=4)
print(especificitat_reglog_3)
print(especificitat_reglog)
print(mae_reglog)
print(f1_reglog_3)
print(precisio_reglog_3)
print(balanced_accuracy_reglog)
print(sensibilitat_reglog)
print(f1_reglog)
print(accuracy_reglog)
[0.8539441073412529, 0.8879108148401371, 0.8312600892602792, 0.8525767028459106, 0.8883180764994113, 0.8315243711363701, 0.8520734220247083, 0.8890313268571537, 0.8321210164469935, 0.853070487802562, 0.8897435166984778, 0.8315702510730428, 0.8521954647320241, 0.8876164430390565, 0.8311730241479836]
[0.8501381565210276, 0.8501358646035232, 0.8501014578998057, 0.8501435258656154, 0.8501062662221353]
[0.3034172343065509, 0.3046029766464299, 0.30327355140127754, 0.303032988737869, 0.30559688028209137]
[array([0.7157, 0.5171, 0.8553]), array([0.7139, 0.5173, 0.8555]), array([0.7158, 0.5158, 0.856 ]), array([0.7163, 0.5185, 0.8557]), array([0.7125, 0.5157, 0.8553])]
[array([0.7114, 0.6557, 0.7475]), array([0.7091, 0.6565, 0.7478]), array([0.7094, 0.6568, 0.7485]), array([0.7107, 0.6595, 0.748 ]), array([0.7081, 0.6543, 0.7475])]
[0.7827739983688929, 0.7825408848286192, 0.7827927318239936, 0.7831999321876332, 0.7820481256093108]
[0.7154098402167582, 0.7149459050537151, 0.7154840057481815, 0.716256338509651, 0.7139899849964865]
[0.6960244653301166, 0.6955544212160086, 0.6958712946965605, 0.6968256467035021, 0.6944906459417279]
[0.7154098402167582, 0.7149459050537151, 0.7154840057481815, 0.716256338509651, 0.7139899849964865]

6.2 RANDOM FOREST¶

In [82]:
# Busquem els millors paràmetres per realitzar un Random Forest
X = dades_equilibrades.drop('Accident_Severity', axis=1) 
y = dades_equilibrades['Accident_Severity'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123)

param_grid = {
    'max_depth': [None, 3 ,5, 7, 10, 15],
    'min_samples_leaf': [1, 2, 5, 10, 20, 50],
    'min_samples_split': [2, 5, 10, 20, 50, 100],
    'bootstrap': [True, False],
    'criterion': ["gini", "entropy", "log_loss"]
}

rf = RandomForestClassifier(random_state = 123)

search = HalvingGridSearchCV(rf, param_grid, cv = 5, factor = 2,  n_jobs = -1)

search.fit(X_train, y_train)


print(search.best_params_)
print(np.mean(search.best_score_))
#{'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5}
{'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5}
0.928926078707739

Base de dades inicial (desequilibri de classes)

In [83]:
start_time = time.time()

X = dades_desequilibrades.drop('Accident_Severity', axis=1)
y = dades_desequilibrades['Accident_Severity']

noms_classes = ["Fatal", "Serious", "Slight"]

start_time = time.time()

model_rf = RandomForestClassifier(
    n_estimators=300,        
    max_depth=None,          
    min_samples_leaf=1,      
    min_samples_split=5,
    bootstrap=False,
    criterion='gini',
    random_state=123          
)


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

accuracy_rf_0 = []
precisio_rf_0_3 = []
precisio_rf_0 = []
f1_rf_0_3 = []
f1_rf_0 = []
sensibilitat_rf_0_3 = []
sensibilitat_rf_0 = []
balanced_accuracy_rf_0 = []
mc_rf_0 = []
especificitat_rf_0_3 = []
especificitat_rf_0 = []
mae_rf_0 = []


for train_index, test_index in cv.split(X, y):

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model_rf.fit(X_train, y_train)
    y_pred_rf = model_rf.predict(X_test)
    cm = confusion_matrix(y_test, y_pred_rf)
    cm_df = pd.DataFrame(cm, index = noms_classes, columns = noms_classes)

    # Mètriques
    # Accuracy, Precisió, F1 Score, Sensibilitat
    accuracy_rf_0.append(accuracy_score(y_test, y_pred_rf))
    precisio_rf_0_3.append(precision_score(y_test, y_pred_rf, average=None))
    precisio_rf_0.append(precision_score(y_test, y_pred_rf, average = 'weighted'))
    f1_rf_0_3.append(f1_score(y_test, y_pred_rf, average = None))
    f1_rf_0.append(f1_score(y_test, y_pred_rf, average = 'weighted'))
    sensibilitat_rf_0_3.append(recall_score(y_test, y_pred_rf, average = None))
    sensibilitat_ponderada = recall_score(y_test, y_pred_rf, average = 'weighted')
    sensibilitat_rf_0.append(sensibilitat_ponderada)
    mc_rf_0.append(cm)
   
    # Especificitat
    x = np.diag(cm)

    for i in noms_classes:
        index = noms_classes.index(i)
        TN = cm_df.drop(i, axis = 0).drop(i, axis = 1).values.sum()
        FP = cm_df[i].sum() - cm_df.loc[i, i]
        especificitat = TN / (TN + FP) if (TN + FP) > 0 else 0
        especificitat_rf_0_3.append(especificitat)
       
    especificitat_ponderada = (especificitat_rf_0_3[0]*x[0] + especificitat_rf_0_3[1]*x[1] +
                        especificitat_rf_0_3[2]*x[2]) / sum(x)
   
    especificitat_rf_0.append(especificitat_ponderada)


    # Balanced Accuracy
    ba = (sensibilitat_ponderada + especificitat_ponderada) / 2
    balanced_accuracy_rf_0.append(ba)
   
    # MAE
    pesos = np.array([
    [0, 1, 2],
    [1, 0, 1],
    [2, 1, 0]
    ])

    n = np.sum(cm)
    mae = np.sum(cm * pesos) / n
    mae_rf_0.append(mae)
    
end_time = time.time()
total_time_rf_0 = round(end_time - start_time,2)

# Matriu de confusió
matriu_conf_rf_0 = np.round(np.mean(mc_rf_0, axis = 0))
matriu_conf_rf_0_df = pd.DataFrame(matriu_conf_rf_0, index=["Fatal", "Serious", "Slight"],
                                   columns=["Fatal", "Serious", "Slight"])
matriu_conf_rf_0_df = np.round(matriu_conf_rf_0_df)
plt.rcParams.update({'font.size': 13})
np.set_printoptions(formatter={'float': '{:.0f}'.format})

sns.heatmap(matriu_conf_rf_0_df, annot=True, cmap='Greens', fmt='g')
plt.xlabel('Classe predita', fontsize = 15)
plt.ylabel('Classe vertadera', fontsize = 15)
plt.xticks(ticks = np.arange(len(noms_classes)), labels = noms_classes, rotation = 0, fontsize = 14)
plt.yticks(ticks = np.arange(len(noms_classes)), labels = noms_classes, rotation = 0, fontsize = 14)
plt.title('Matriu de Confusió', fontsize = 17)
plt.show()
In [84]:
# mètriques
np.set_printoptions(precision=2)
print(especificitat_rf_0_3)
print(especificitat_rf_0)
print(mae_rf_0)
print(f1_rf_0_3)
print(precisio_rf_0_3)
print(balanced_accuracy_rf_0)
print(sensibilitat_rf_0)
print(accuracy_rf_0)
[0.999736855089387, 0.9929647300963608, 0.01633475050346834, 0.9999342126903721, 0.992160164655253, 0.017451616511914084, 0.999901319035558, 0.9915988399288989, 0.019465264570981095, 0.999868425380744, 0.9916736832257461, 0.01823470186821792, 0.999868425380744, 0.9930957058658434, 0.014993845809555779]
[0.01868273228958722, 0.018796220294279548, 0.019094981029783943, 0.01892759832904299, 0.01862669676558247]
[0.1618690435601448, 0.161901514782524, 0.16216128456155732, 0.1623723475070219, 0.16182296402130147]
[array([0.  , 0.03, 0.92]), array([0.01, 0.03, 0.92]), array([0.01, 0.03, 0.92]), array([0.01, 0.03, 0.92]), array([0.  , 0.03, 0.92])]
[array([0.  , 0.25, 0.86]), array([0.56, 0.23, 0.86]), array([0.33, 0.24, 0.86]), array([0.27, 0.23, 0.86]), array([0.  , 0.25, 0.86])]
[0.4347874395622274, 0.4346655918414881, 0.43470132293091324, 0.4345445713301897, 0.4347906831015859]
[0.8508921468348676, 0.8505349633886967, 0.8503076648320426, 0.8501615443313364, 0.8509546694375894]
[0.8508921468348676, 0.8505349633886967, 0.8503076648320426, 0.8501615443313364, 0.8509546694375894]

Base de dades (classes balancejades - SMOTE)

In [85]:
start_time = time.time()

X = dades_equilibrades.drop('Accident_Severity', axis=1)
y = dades_equilibrades['Accident_Severity']

noms_classes = ["Fatal", "Serious", "Slight"] # 0, 1, 2

model_rf = RandomForestClassifier(
    n_estimators=300,        
    max_depth=None,          
    min_samples_leaf=1,      
    min_samples_split=5,
    bootstrap=False,
    criterion='gini',
    random_state=123            
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

accuracy_rf = []
precisio_rf_3 = []
precisio_rf = []
f1_rf_3 = []
f1_rf = []
sensibilitat_rf_3 = []
sensibilitat_rf = []
balanced_accuracy_rf = []
mc_rf = []
especificitat_rf_3 = []
especificitat_rf = []
mae_rf = []

for train_index, test_index in cv.split(X, y):

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model_rf.fit(X_train, y_train)
    y_pred_rf = model_rf.predict(X_test)
    cm = confusion_matrix(y_test, y_pred_rf)
    cm_df = pd.DataFrame(cm, index = noms_classes, columns = noms_classes) # diferent cada vegada

    # Mètriques
    # Accuracy, Precisió, F1 Score, Sensibilitat
    accuracy_rf.append(accuracy_score(y_test, y_pred_rf))
    precisio_rf_3.append(precision_score(y_test, y_pred_rf, average=None))
    precisio_rf.append(precision_score(y_test, y_pred_rf, average = 'weighted'))
    f1_rf_3.append(f1_score(y_test, y_pred_rf, average = None))
    f1_rf.append(f1_score(y_test, y_pred_rf, average = 'weighted'))
    sensibilitat_rf_3.append(recall_score(y_test, y_pred_rf, average = None))
    sensibilitat_ponderada = recall_score(y_test, y_pred_rf, average = 'weighted')
    sensibilitat_rf.append(sensibilitat_ponderada)
    mc_rf.append(cm) # anem guardant totes les matrius de confusió
   
    # Especificitat
    x = np.diag(cm)

    for i in noms_classes:
        index = noms_classes.index(i)
        TN = cm_df.drop(i, axis = 0).drop(i, axis = 1).values.sum()
        FP = cm_df[i].sum() - cm_df.loc[i, i]
        especificitat = TN / (TN + FP) if (TN + FP) > 0 else 0
        especificitat_rf_3.append(especificitat)
       
    especificitat_ponderada = (especificitat_rf_3[0]*x[0] + especificitat_rf_3[1]*x[1] +
                        especificitat_rf_3[2]*x[2]) / sum(x)
   
    especificitat_rf.append(especificitat_ponderada)


    # Balanced Accuracy
    ba = (sensibilitat_ponderada + especificitat_ponderada) / 2
    balanced_accuracy_rf.append(ba)
   
    # MAE
    pesos = np.array([
    [0, 1, 2],
    [1, 0, 1],
    [2, 1, 0]
    ])

    n = np.sum(cm)
    mae = np.sum(cm * pesos) / n
    mae_rf.append(mae)
    
end_time = time.time()
total_time_rf = round(end_time - start_time,2)

# Matriu de confusió
matriu_conf_rf = np.round(np.mean(mc_rf, axis = 0))
matriu_conf_rf_df = pd.DataFrame(matriu_conf_rf, index=["Fatal", "Serious", "Slight"],
                                   columns=["Fatal", "Serious", "Slight"])
matriu_conf_rf_df = np.round(matriu_conf_rf_df)
plt.rcParams.update({'font.size': 13})
np.set_printoptions(formatter={'float': '{:.0f}'.format})

sns.heatmap(matriu_conf_rf_df, annot=True, cmap='Greens', fmt='g')
plt.xlabel('Classe predita', fontsize = 15)
plt.ylabel('Classe vertadera', fontsize = 15)
plt.xticks(ticks = np.arange(len(noms_classes)), labels = noms_classes, rotation = 0, fontsize = 14)
plt.yticks(ticks = np.arange(len(noms_classes)), labels = noms_classes, rotation = 0, fontsize = 14)
plt.title('Matriu de Confusió', fontsize = 17)
plt.show()
In [86]:
# mètriques
np.set_printoptions(precision=2)
print(especificitat_rf_3)
print(especificitat_rf)
print(mae_rf)
print(f1_rf_3)
print(precisio_rf_3)
print(balanced_accuracy_rf)
print(sensibilitat_rf)
print(accuracy_rf)
[0.994454415102223, 0.983353749442118, 0.9217453233311177, 0.994511390289529, 0.9850818551297147, 0.9222953403792649, 0.9946253406641408, 0.9833442535775669, 0.9223135944087819, 0.9948437455488135, 0.9839329971797283, 0.9211455919778174, 0.9948152087210849, 0.984094426877095, 0.9216116381315936]
[0.9658848905010445, 0.9658376853638001, 0.9658849017843866, 0.9658581787855628, 0.9658574106236955]
[0.07142766706338154, 0.07009236340155606, 0.07090267974145845, 0.07166235131011693, 0.0708203819881871]
[array([0.99, 0.9 , 0.91]), array([0.99, 0.9 , 0.91]), array([0.99, 0.9 , 0.91]), array([0.99, 0.9 , 0.91]), array([0.99, 0.9 , 0.91])]
[array([0.99, 0.96, 0.86]), array([0.99, 0.97, 0.86]), array([0.99, 0.96, 0.86]), array([0.99, 0.96, 0.86]), array([0.99, 0.96, 0.86])]
[0.9494601328249064, 0.9502149816511524, 0.9497036544651819, 0.9495700116340657, 0.9497690413399049]
[0.9330353751487681, 0.9345922779385046, 0.9335224071459772, 0.9332818444825687, 0.9336806720561144]
[0.9330353751487681, 0.9345922779385046, 0.9335224071459772, 0.9332818444825687, 0.9336806720561144]

6.3 GRADIENT BOOSTING (XGBoost)¶

Base de dades inicial (desequilibri de classes)

In [87]:
# Busquem els millors paràmetres per realitzar un XGBoost
X = dades_equilibrades.drop('Accident_Severity', axis=1) 
y = dades_equilibrades['Accident_Severity'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123)

param_grid = {
    'max_depth': [None, 3 ,5, 7, 10, 15],
    'learning_rate': [0.001, 0.003, 0.01, 0.03, 0.1, 0.3],
    'subsample': [0.25, 0.5, 1],
    'colsample_bytree': [0.2, 0.4, 0.6, 0.8, 1],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'gamma': [0, 0.3, 0.6, 1]
    
}

xgb = XGBClassifier(random_state = 123, objective = 'multi:softmax', n_estimators = 300)

search = HalvingGridSearchCV(xgb, param_grid, cv = 5, factor = 2,  n_jobs = -1)

search.fit(X_train, y_train)


print(search.best_params_)
print(np.mean(search.best_score_))
#{'colsample_bytree': 0.8, 'gamma': 0.6, 'learning_rate': 0.05, 'max_depth': 15, 'subsample': 0.5}
{'colsample_bytree': 1, 'gamma': 0.3, 'learning_rate': 0.1, 'max_depth': 15, 'subsample': 0.5}
0.927631380497316
In [88]:
start_time = time.time()

X = dades_desequilibrades.drop('Accident_Severity', axis=1)
y = dades_desequilibrades['Accident_Severity']

noms_classes = ["Fatal", "Serious", "Slight"]

start_time = time.time()

model_xgb = XGBClassifier(
    objective = 'multi:softmax',
    n_estimators = 300,
    max_depth = 15,
    learning_rate = 0.05,
    subsample = 0.5,
    colsample_bytree = 0.8,
    random_state = 123)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

accuracy_xgb_0 = []
precisio_xgb_0_3 = []
precisio_xgb_0 = []
f1_xgb_0_3 = []
f1_xgb_0 = []
sensibilitat_xgb_0_3 = []
sensibilitat_xgb_0 = []
balanced_accuracy_xgb_0 = []
mc_xgb_0 = []
especificitat_xgb_0_3 = []
especificitat_xgb_0 = []
mae_xgb_0 = []


for train_index, test_index in cv.split(X, y):

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model_xgb.fit(X_train, y_train)
    y_pred_xgb = model_xgb.predict(X_test)
    cm = confusion_matrix(y_test, y_pred_xgb)
    cm_df = pd.DataFrame(cm, index = noms_classes, columns = noms_classes)

    # Mètriques
    # Accuracy, Precisió, F1 Score, Sensibilitat
    accuracy_xgb_0.append(accuracy_score(y_test, y_pred_xgb))
    precisio_xgb_0_3.append(precision_score(y_test, y_pred_xgb, average=None))
    precisio_xgb_0.append(precision_score(y_test, y_pred_xgb, average = 'weighted'))
    f1_xgb_0_3.append(f1_score(y_test, y_pred_xgb, average = None))
    f1_xgb_0.append(f1_score(y_test, y_pred_xgb, average = 'weighted'))
    sensibilitat_xgb_0_3.append(recall_score(y_test, y_pred_xgb, average = None))
    sensibilitat_ponderada = recall_score(y_test, y_pred_xgb, average = 'weighted')
    sensibilitat_xgb_0.append(sensibilitat_ponderada)
    mc_xgb_0.append(cm)
   
    # Especificitat
    x = np.diag(cm)

    for i in noms_classes:
        index = noms_classes.index(i)
        TN = cm_df.drop(i, axis = 0).drop(i, axis = 1).values.sum()
        FP = cm_df[i].sum() - cm_df.loc[i, i]
        especificitat = TN / (TN + FP) if (TN + FP) > 0 else 0
        especificitat_xgb_0_3.append(especificitat)
       
    especificitat_ponderada = (especificitat_xgb_0_3[0]*x[0] + especificitat_xgb_0_3[1]*x[1] +
                        especificitat_xgb_0_3[2]*x[2]) / sum(x)
   
    especificitat_xgb_0.append(especificitat_ponderada)


    # Balanced Accuracy
    ba = (sensibilitat_ponderada + especificitat_ponderada) / 2
    balanced_accuracy_xgb_0.append(ba)
   
    # MAE
    pesos = np.array([
    [0, 1, 2],
    [1, 0, 1],
    [2, 1, 0]
    ])

    n = np.sum(cm)
    mae = np.sum(cm * pesos) / n
    mae_xgb_0.append(mae)
    
end_time = time.time()
total_time_xgb_0 = round(end_time - start_time,2)

# Matriu de confusió
matriu_conf_xgb_0 = np.round(np.mean(mc_xgb_0, axis = 0))
matriu_conf_xgb_0_df = pd.DataFrame(matriu_conf_xgb_0, index=["Fatal", "Serious", "Slight"],
                                   columns=["Fatal", "Serious", "Slight"])
matriu_conf_xgb_0_df = np.round(matriu_conf_xgb_0_df)
plt.rcParams.update({'font.size': 13})
np.set_printoptions(formatter={'float': '{:.0f}'.format})

sns.heatmap(matriu_conf_xgb_0_df, annot=True, cmap='Greens', fmt='g')
plt.xlabel('Classe predita', fontsize = 15)
plt.ylabel('Classe vertadera', fontsize = 15)
plt.xticks(ticks = np.arange(len(noms_classes)), labels = noms_classes, rotation = 0, fontsize = 14)
plt.yticks(ticks = np.arange(len(noms_classes)), labels = noms_classes, rotation = 0, fontsize = 14)
plt.title('Matriu de Confusió', fontsize = 17)
plt.show()
In [89]:
# mètriques
np.set_printoptions(precision=2)
print(especificitat_xgb_0_3)
print(especificitat_xgb_0)
print(mae_xgb_0)
print(f1_xgb_0_3)
print(precisio_xgb_0_3)
print(balanced_accuracy_xgb_0)
print(accuracy_xgb_0)
[0.9998848741016069, 0.994966788287024, 0.018124860147684045, 0.9998519785533371, 0.994779680044906, 0.020360219263899765, 0.9998519785533371, 0.9946112826269997, 0.01857030987806242, 0.999868425380744, 0.9944428852090934, 0.020360219263899765, 0.999868425380744, 0.9946299934512115, 0.018014993845809554]
[0.020803417671398268, 0.020989423985773706, 0.02086059891102506, 0.020971661856198663, 0.020581436067552784]
[0.15938499504813858, 0.1590765184355365, 0.15962852921598233, 0.15933628821456983, 0.15950123392648397]
[array([0.03, 0.03, 0.92]), array([0.05, 0.03, 0.92]), array([0.05, 0.03, 0.92]), array([0.04, 0.03, 0.92]), array([0.04, 0.03, 0.92])]
[array([0.67, 0.33, 0.86]), array([0.68, 0.33, 0.86]), array([0.68, 0.31, 0.86]), array([0.69, 0.31, 0.86]), array([0.67, 0.29, 0.86])]
[0.43691933259164545, 0.4370691603879967, 0.4368018027107526, 0.43692227662809774, 0.4366529079285679]
[0.8530352475118926, 0.8531488967902197, 0.8527430065104801, 0.8528728913999968, 0.852724379789583]

Base de dades (classes balancejades - SMOTE)

In [90]:
start_time = time.time()

X = dades_equilibrades.drop('Accident_Severity', axis=1)
y = dades_equilibrades['Accident_Severity']

noms_classes = ["Fatal", "Serious", "Slight"] # 0, 1, 2

model_xgb = XGBClassifier(
    objective = 'multi:softmax',
    n_estimators = 300,
    max_depth = 15,
    learning_rate = 0.05,
    subsample = 0.5,
    colsample_bytree = 0.8,
    random_state = 123)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

accuracy_xgb = []
precisio_xgb_3 = []
precisio_xgb = []
f1_xgb_3 = []
f1_xgb = []
sensibilitat_xgb_3 = []
sensibilitat_xgb = []
balanced_accuracy_xgb = []
mc_xgb = []
especificitat_xgb_3 = []
especificitat_xgb = []
mae_xgb = []

for train_index, test_index in cv.split(X, y):

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model_xgb.fit(X_train, y_train)
    y_pred_xgb = model_xgb.predict(X_test)
    cm = confusion_matrix(y_test, y_pred_xgb)
    cm_df = pd.DataFrame(cm, index = noms_classes, columns = noms_classes) # diferent cada vegada

    # Mètriques
    # Accuracy, Precisió, F1 Score, Sensibilitat
    accuracy_xgb.append(accuracy_score(y_test, y_pred_xgb))
    precisio_xgb_3.append(precision_score(y_test, y_pred_xgb, average=None))
    precisio_xgb.append(precision_score(y_test, y_pred_xgb, average = 'weighted'))
    f1_xgb_3.append(f1_score(y_test, y_pred_xgb, average = None))
    f1_xgb.append(f1_score(y_test, y_pred_xgb, average = 'weighted'))
    sensibilitat_xgb_3.append(recall_score(y_test, y_pred_xgb, average = None))
    sensibilitat_ponderada = recall_score(y_test, y_pred_xgb, average = 'weighted')
    sensibilitat_xgb.append(sensibilitat_ponderada)
    mc_xgb.append(cm) # anem guardant totes les matrius de confusió
   
    # Especificitat
    x = np.diag(cm)

    for i in noms_classes:
        index = noms_classes.index(i)
        TN = cm_df.drop(i, axis = 0).drop(i, axis = 1).values.sum()
        FP = cm_df[i].sum() - cm_df.loc[i, i]
        especificitat = TN / (TN + FP) if (TN + FP) > 0 else 0
        especificitat_xgb_3.append(especificitat)
       
    especificitat_ponderada = (especificitat_xgb_3[0]*x[0] + especificitat_xgb_3[1]*x[1] +
                        especificitat_xgb_3[2]*x[2]) / sum(x)
   
    especificitat_xgb.append(especificitat_ponderada)


    # Balanced Accuracy
    ba = (sensibilitat_ponderada + especificitat_ponderada) / 2
    balanced_accuracy_xgb.append(ba)
   
    # MAE
    pesos = np.array([
    [0, 1, 2],
    [1, 0, 1],
    [2, 1, 0]
    ])

    n = np.sum(cm)
    mae = np.sum(cm * pesos) / n
    mae_xgb.append(mae)
    
end_time = time.time()
total_time_xgb = round(end_time - start_time,2)

# Matriu de confusió
matriu_conf_xgb = np.round(np.mean(mc_xgb, axis = 0))
matriu_conf_xgb_df = pd.DataFrame(matriu_conf_xgb, index=["Fatal", "Serious", "Slight"],
                                   columns=["Fatal", "Serious", "Slight"])
matriu_conf_xgb_df = np.round(matriu_conf_xgb_df)
plt.rcParams.update({'font.size': 13})
np.set_printoptions(formatter={'float': '{:.0f}'.format})

sns.heatmap(matriu_conf_xgb_df, annot=True, cmap='Greens', fmt='g')
plt.xlabel('Classe predita', fontsize = 15)
plt.ylabel('Classe vertadera', fontsize = 15)
plt.xticks(ticks = np.arange(len(noms_classes)), labels = noms_classes, rotation = 0, fontsize = 14)
plt.yticks(ticks = np.arange(len(noms_classes)), labels = noms_classes, rotation = 0, fontsize = 14)
plt.title('Matriu de Confusió', fontsize = 17)
plt.show()
In [91]:
np.set_printoptions(precision=2)
print(especificitat_xgb_3)
print(especificitat_xgb)
print(mae_xgb)
print(f1_xgb_3)
print(precisio_xgb_3)
print(balanced_accuracy_xgb)
print(accuracy_xgb)
print(f1_xgb)
[0.9801821306820879, 0.9899628711696057, 0.9138258474978634, 0.9817964276557559, 0.9896873931705094, 0.9144992355829037, 0.9812836509700026, 0.9890322764436088, 0.9142515288487104, 0.9818344111139599, 0.9898774083886468, 0.9143180005317735, 0.9814544004254188, 0.9891557226827716, 0.9136541036378657]
[0.9594405568324412, 0.9594749433028715, 0.9594705476612042, 0.9594847883183295, 0.959471516116576]
[0.08165784609151452, 0.0805378474706102, 0.08117090711115894, 0.0807404265555858, 0.08162037945594855]
[array([0.97, 0.88, 0.91]), array([0.97, 0.88, 0.91]), array([0.97, 0.88, 0.91]), array([0.97, 0.88, 0.91]), array([0.97, 0.88, 0.91])]
[array([0.96, 0.98, 0.85]), array([0.96, 0.98, 0.85]), array([0.96, 0.97, 0.85]), array([0.96, 0.98, 0.85]), array([0.96, 0.97, 0.85])]
[0.941043744522422, 0.9417317361310924, 0.9412579088780499, 0.9417524851298351, 0.941157103563248]
[0.9226469322124028, 0.9239885289593133, 0.9230452700948957, 0.9240201819413407, 0.92284269100992]
[0.9214394709622198, 0.922923756117588, 0.9219152443294609, 0.9229769897730074, 0.9217343274056785]

7. RESULTATS¶

In [92]:
# random forest 
np.set_printoptions(precision=2) 

metriques = pd.DataFrame({
    'Model': ['Random Forest (Smote)', 'Random Forest (No Smote)'],
    'Accuracy': [round(np.mean(accuracy_rf),3), round(np.mean(accuracy_rf_0),3)],
    'Balanced Accuracy': [round(np.mean(balanced_accuracy_rf),3), round(np.mean(balanced_accuracy_rf_0),3)],
    'Sensibilitat': [round(np.mean(sensibilitat_rf),3), round(np.mean(sensibilitat_rf_0),3)],
    'Especificitat': [round(np.mean(especificitat_rf),3), round(np.mean(especificitat_rf_0),3)],
    'Precisió': [round(np.mean(precisio_rf),3), round(np.mean(precisio_rf_0),3)],
    'F1 Score': [round(np.mean(f1_rf),3), round(np.mean(f1_rf_0),3)],
    'MAE': [round(np.mean(mae_rf),3), round(np.mean(mae_rf_0),3)],
    'Temps transcorregut': [round(total_time_rf/60,3), round(total_time_rf_0/60,3)]
})

metriques
Out[92]:
Model Accuracy Balanced Accuracy Sensibilitat Especificitat Precisió F1 Score MAE Temps transcorregut
0 Random Forest (Smote) 0.934 0.950 0.934 0.966 0.938 0.933 0.071 65.090
1 Random Forest (No Smote) 0.851 0.435 0.851 0.019 0.767 0.790 0.162 15.275
In [93]:
# només regressió logistica multinomial
np.set_printoptions(precision=2) 

metriques = pd.DataFrame({
    'Model': ['Regressió Logística Multinomial (Smote)', 'Regressió Logística Multinomial (No Smote)'],
    'Accuracy': [round(np.mean(accuracy_reglog),3), round(np.mean(accuracy_reglog_0),3)],
    'Balanced Accuracy': [round(np.mean(balanced_accuracy_reglog),3), round(np.mean(balanced_accuracy_reglog_0),3)],
    'Sensibilitat': [round(np.mean(sensibilitat_reglog),3), round(np.mean(sensibilitat_reglog_0),3)],
    'Especificitat': [round(np.mean(especificitat_reglog),3), round(np.mean(especificitat_reglog_0),3)],
    'Precisió': [round(np.mean(precisio_reglog),3), round(np.mean(precisio_reglog_0),3)],
    'F1 Score': [round(np.mean(f1_reglog),3), round(np.mean(f1_reglog_0),3)],
    'MAE': [round(np.mean(mae_reglog),3), round(np.mean(mae_reglog_0),3)],
    'Temps transcorregut': [round(total_time_reglog/60,3), round(total_time_reglog_0/60,3)]
})

metriques
Out[93]:
Model Accuracy Balanced Accuracy Sensibilitat Especificitat Precisió F1 Score MAE Temps transcorregut
0 Regressió Logística Multinomial (Smote) 0.715 0.783 0.715 0.85 0.705 0.696 0.304 2.307
1 Regressió Logística Multinomial (No Smote) 0.855 0.427 0.855 0.00 0.731 0.788 0.158 0.229
In [94]:
# només regressió XGB
np.set_printoptions(precision=2)

metriques = pd.DataFrame({
    'Model': ['XGBoost (Smote)', 'XGBoost (No Smote)'],
    'Accuracy': [round(np.mean(accuracy_xgb),3), round(np.mean(accuracy_xgb_0),3)],
    'Balanced Accuracy': [round(np.mean(balanced_accuracy_xgb),3), round(np.mean(balanced_accuracy_xgb_0),3)],
    'Sensibilitat': [round(np.mean(sensibilitat_xgb),3), round(np.mean(sensibilitat_xgb_0),3)],
    'Especificitat': [round(np.mean(especificitat_xgb),3), round(np.mean(especificitat_xgb_0),3)],
    'Precisió': [round(np.mean(precisio_xgb),3), round(np.mean(precisio_xgb_0),3)],
    'F1 Score': [round(np.mean(f1_xgb),3), round(np.mean(f1_xgb_0),3)],
    'MAE': [round(np.mean(mae_xgb),3), round(np.mean(mae_xgb_0),3)],
    'Temps transcorregut': [round(total_time_xgb/60,3), round(total_time_xgb_0/60,3)]
})

metriques
Out[94]:
Model Accuracy Balanced Accuracy Sensibilitat Especificitat Precisió F1 Score MAE Temps transcorregut
0 XGBoost (Smote) 0.923 0.941 0.923 0.959 0.930 0.922 0.081 8.598
1 XGBoost (No Smote) 0.853 0.437 0.853 0.021 0.782 0.792 0.159 3.270
In [115]:
# junt

metriques_smote = pd.DataFrame({
    'Model': ['Regressió Logística Multinomial','Random Forest','XGBoost'],
    'Accuracy': [round(np.mean(accuracy_reglog),3), round(np.mean(accuracy_rf),3), round(np.mean(accuracy_xgb),3)],
    'Balanced Accuracy': [round(np.mean(balanced_accuracy_reglog),3), round(np.mean(balanced_accuracy_rf),3),
                          round(np.mean(balanced_accuracy_xgb),3)],
    'Sensibilitat': [round(np.mean(sensibilitat_reglog),3), round(np.mean(sensibilitat_rf),3),
                     round(np.mean(sensibilitat_xgb),3)],
    'Especificitat': [round(np.mean(especificitat_reglog),3), round(np.mean(especificitat_rf),3),
                      round(np.mean(especificitat_xgb),3)],
    'Precisió': [round(np.mean(precisio_reglog),3), round(np.mean(precisio_rf),3), round(np.mean(precisio_xgb),3)],
    'F1 Score': [round(np.mean(f1_reglog),3), round(np.mean(f1_rf),3), round(np.mean(f1_xgb),3)],
    'MAE': [round(np.mean(mae_reglog),3), round(np.mean(mae_rf),3), round(np.mean(mae_xgb),3)],
    'Temps transcorregut': [round(total_time_reglog/60,3), round(total_time_rf/60,3), round(total_time_xgb/60,3), ]
})


metriques_no_smote = pd.DataFrame({
    'Model': ['Regressió Logística Multinomial','Random Forest', 'XGBoost'],
    'Accuracy': [round(np.mean(accuracy_reglog_0),3), round(np.mean(accuracy_rf_0),3), round(np.mean(accuracy_xgb_0),3)],
    'Balanced Accuracy': [ round(np.mean(balanced_accuracy_reglog_0),3), round(np.mean(balanced_accuracy_rf_0),3),
                          round(np.mean(balanced_accuracy_xgb_0),3)],
    'Sensibilitat': [round(np.mean(sensibilitat_reglog_0),3), round(np.mean(sensibilitat_rf_0),3),
                     round(np.mean(sensibilitat_xgb_0),3)],
    'Especificitat': [round(np.mean(especificitat_reglog_0),3), round(np.mean(especificitat_rf_0),3),
                      round(np.mean(especificitat_xgb_0),3)],
    'Precisió': [round(np.mean(precisio_reglog_0),3), round(np.mean(precisio_rf_0),3), round(np.mean(precisio_xgb_0),3)],
    'F1 Score': [round(np.mean(f1_reglog_0),3), round(np.mean(f1_rf_0),3), round(np.mean(f1_xgb_0),3)],
    'MAE': [round(np.mean(mae_reglog_0),3), round(np.mean(mae_rf_0),3), round(np.mean(mae_xgb_0),3)],
    'Temps transcorregut': [round(total_time_reglog_0/60,3), round(total_time_rf_0/60,3), round(total_time_xgb_0/60,3)]
})

#metriques_smote
#metriques_no_smote
In [117]:
colors = {
    'Accuracy': '#006400',
    'Balanced Accuracy': '#008000',
    'Sensibilitat': '#228B22',
    'Especificitat': '#3CB371',
    'Precisió': '#32CD32',
    'F1 Score': '#7CFC00',
    'MAE': '#90EE90',
    'Temps transcorregut': '#90EE90'
}
metriques_1 = metriques_no_smote.iloc[:, :-2]
metriques_1 = metriques_1.melt(id_vars = 'Model', var_name = 'Mètrica', value_name = 'Valor')

plt.figure(figsize = (9, 5))
sns.barplot(data=metriques_1, x='Model', y='Valor', hue='Mètrica', palette=colors)
plt.xlabel("")
plt.ylabel("")
plt.title('COMPARACIÓ DE MÈTRIQUES ENTRE ELS MODELS (No Smote)', fontsize = 16)
plt.legend(title = 'Mètriques', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation = 45)
plt.tight_layout()
plt.show()

metriques_2 = metriques_smote.iloc[:, :-2]
metriques_2 = metriques_2.melt(id_vars = 'Model', var_name = 'Mètrica', value_name = 'Valor')
plt.figure(figsize = (9, 5))
sns.barplot(data=metriques_2, x='Model', y='Valor', hue='Mètrica', palette=colors)

plt.xlabel("")
plt.ylabel("")
plt.title('COMPARACIÓ DE MÈTRIQUES ENTRE ELS MODELS (Smote)', fontsize = 16)
plt.legend(title = 'Mètriques', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation = 45)
plt.tight_layout()
plt.show()
In [133]:
colors1 = {
    'Regressió Logística Multinomial': '#006400',
    'Random Forest': '#32CD32',
    'XGBoost': '#7CFC00',
}

#dades inicials
plt.figure(figsize = (15, 8))
sns.barplot(data=metriques_1, x='Mètrica', y='Valor', hue='Model', palette=colors1)

plt.xlabel("")
plt.ylabel("")
plt.title('COMPARACIÓ DE MÈTRIQUES ENTRE ELS DIFERENTS\nMODELS (No Smote)', fontsize = 20)
plt.legend(title = 'Models', bbox_to_anchor=(1, 1), loc='upper left', fontsize = 16, title_fontsize = 16)
plt.xticks(rotation = 45, fontsize = 16)
plt.yticks(fontsize = 16)
plt.tight_layout()
plt.show()

#dades equilibrades
plt.figure(figsize = (15, 8))
sns.barplot(data=metriques_2, x='Mètrica', y='Valor', hue='Model', palette=colors1)

plt.xlabel("")
plt.ylabel("")
plt.title('COMPARACIÓ DE MÈTRIQUES ENTRE ELS DIFERENTS\nMODELS PREDICTIUS', fontsize = 20)
plt.legend(title = 'Models', bbox_to_anchor=(1, 1), loc='upper left', fontsize = 16, title_fontsize = 16)
plt.xticks(rotation = 45, fontsize = 16)
plt.yticks(fontsize = 16)
plt.tight_layout()
plt.show()
In [132]:
metriques_2 = metriques_smote.iloc[:, :-1]
metriques_2 = metriques_2.melt(id_vars = 'Model', var_name = 'Mètrica', value_name = 'Valor')
colors2 = {
    'Accuracy': '#006400',
    'Balanced Accuracy': '#008000',
    'Sensibilitat': '#228B22',
    'Especificitat': '#3CB371',
    'Precisió': '#32CD32',
    'F1 Score': '#7CFC00',
    'MAE': '#90EE90',
    'Temps transcorregut': '#90EE90'
}

noms_metriques = ['Accuracy', 'Balanced Accuracy', 'Sensibilitat', 'Especificitat', 'Precisió', 'F1 Score', 'MAE']

for metrica in noms_metriques:
    plt.figure(figsize = (9, 5))
    ax = sns.barplot(data=metriques_2.loc[metriques_2['Mètrica'] == metrica], x='Model', y='Valor', hue='Mètrica', palette=colors2)
    
    for p in ax.patches:
        if p.get_height() > 0:
            ax.annotate('{:.2f}'.format(p.get_height()), 
                        (p.get_x() + p.get_width() / 2., p.get_height()), 
                        ha='center', va='bottom', fontsize=10, color='black', xytext=(0, 5), 
                        textcoords='offset points')
    plt.gca().set_ylim([0, 1.03])
    plt.xlabel("")
    plt.ylabel("")
    plt.title(f'{metrica}') 
    plt.legend().remove()
    plt.xticks(rotation = 0)
    plt.show()
In [131]:
mae = metriques_smote.iloc[:, [0, 7]]
mae = mae.melt(id_vars = 'Model', var_name = 'MAE', value_name = 'Valor')

plt.figure(figsize = (12, 5))
sns.barplot(data=mae, x='MAE', y='Valor', hue='Model', palette=colors1)

for p in plt.gca().patches:
    if p.get_height() > 0:  
        plt.gca().annotate('{}'.format(round(p.get_height(), 2)), 
                            (p.get_x() + p.get_width() / 2., p.get_height()), 
                            ha='center', va='bottom', fontsize=14, color='black', xytext=(0, 5), 
                            textcoords='offset points')
plt.gca().set_ylim([0, 40])
plt.xlabel("")
plt.ylabel("")
plt.title('COMPARACIÓ DEL MAE ENTRE ELS DIFERENTS\nMODELS PREDICTIUS', fontsize = 17)
plt.legend(title = 'Models', bbox_to_anchor=(1, 1), loc='upper left', fontsize = 14, title_fontsize = 14)
plt.xticks(rotation = 0, fontsize = 15)
plt.yticks(fontsize = 15)
plt.ylim(0,1)
plt.tight_layout()
plt.show()
In [99]:
temps = metriques_smote.iloc[:, [0, -1]]
temps = temps.melt(id_vars = 'Model', var_name = 'Eficiència', value_name = 'Valor')

plt.figure(figsize = (12, 5))
sns.barplot(data=temps, x='Eficiència', y='Valor', hue='Model', palette=colors1)

for p in plt.gca().patches:
    if p.get_height() > 0.1:  
        plt.gca().annotate('{}'.format(round(p.get_height(), 2)), 
                            (p.get_x() + p.get_width() / 2., p.get_height()), 
                            ha='center', va='bottom', fontsize=14, color='black', xytext=(0, 5), 
                            textcoords='offset points')
plt.gca().set_ylim([0, 40])
plt.xlabel("")
plt.ylabel("")
plt.title('COMPARACIÓ DE L\'EFICIÈNCIA ENTRE ELS DIFERENTS\nMODELS PREDICTIUS', fontsize = 17)
plt.legend(title = 'Models', bbox_to_anchor=(1, 1), loc='upper left', fontsize = 14, title_fontsize = 14)
plt.xticks(rotation = 0, fontsize = 15)
plt.yticks(fontsize = 15)
plt.ylim(0,80)
plt.tight_layout()
plt.show()

8.CONCLUSIONS¶